mahout#241"

apalumbo Fri, 10 Jun 2016 09:53:27 -0700

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/iterative_operations.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/iterative_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/iterative_operations.hpp
deleted file mode 100644
index 4330431..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/iterative_operations.hpp
+++ /dev/null
@@ -1,2049 +0,0 @@
-#ifndef VIENNACL_LINALG_CUDA_ITERATIVE_OPERATIONS_HPP_
-#define VIENNACL_LINALG_CUDA_ITERATIVE_OPERATIONS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file viennacl/linalg/cuda/iterative_operations.hpp
-    @brief Implementations of operations using sparse matrices using CUDA
-*/
-
-#include "viennacl/forwards.h"
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/linalg/cuda/common.hpp"
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace cuda
-{
-
-//
-// CG vector update:
-//
-
-// cpu scalar
-template<typename NumericT>
-__global__ void pipelined_cg_vector_kernel(NumericT * result,
-                                           NumericT alpha,
-                                           NumericT * p,
-                                           NumericT * r,
-                                           NumericT const * Ap,
-                                           NumericT beta,
-                                           NumericT * inner_prod_buffer,
-                                           unsigned int size)
-{
-  NumericT inner_prod_contrib = 0;
-  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += 
gridDim.x * blockDim.x)
-  {
-    NumericT value_p = p[i];
-    NumericT value_r = r[i];
-
-    result[i] += alpha * value_p;
-    value_r   -= alpha * Ap[i];
-    value_p    = value_r + beta * value_p;
-
-    p[i] = value_p;
-    r[i] = value_r;
-    inner_prod_contrib += value_r * value_r;
-  }
-
-  // parallel reduction in work group
-  __shared__ NumericT shared_array[256];
-  shared_array[threadIdx.x] = inner_prod_contrib;
-  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
-  {
-    __syncthreads();
-    if (threadIdx.x < stride)
-      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
-  }
-
-  // write results to result array
-  if (threadIdx.x == 0)
-    inner_prod_buffer[blockIdx.x] = shared_array[0];
-}
-
-
-template<typename NumericT>
-void pipelined_cg_vector_update(vector_base<NumericT> & result,
-                                NumericT alpha,
-                                vector_base<NumericT> & p,
-                                vector_base<NumericT> & r,
-                                vector_base<NumericT> const & Ap,
-                                NumericT beta,
-                                vector_base<NumericT> & inner_prod_buffer)
-{
-  unsigned int size = result.size();
-  pipelined_cg_vector_kernel<<<128, 128>>>(viennacl::cuda_arg(result),
-                                           alpha,
-                                           viennacl::cuda_arg(p),
-                                           viennacl::cuda_arg(r),
-                                           viennacl::cuda_arg(Ap),
-                                           beta,
-                                           
viennacl::cuda_arg(inner_prod_buffer),
-                                           size);
-  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_vector_kernel");
-}
-
-
-
-
-//
-// Compressed matrix
-//
-
-
-template<unsigned int SubWarpSizeV, typename NumericT>
-__global__ void pipelined_cg_csr_vec_mul_blocked_kernel(
-          const unsigned int * row_indices,
-          const unsigned int * column_indices,
-          const NumericT * elements,
-          const NumericT * p,
-          NumericT * Ap,
-          unsigned int size,
-          NumericT * inner_prod_buffer,
-          unsigned int buffer_size)
-{
-  __shared__ NumericT shared_elements[256];
-  NumericT inner_prod_ApAp = 0;
-  NumericT inner_prod_pAp = 0;
-
-  const unsigned int id_in_row = threadIdx.x % SubWarpSizeV;
-  const unsigned int block_increment = blockDim.x * ((size - 1) / (gridDim.x * 
blockDim.x) + 1);
-  const unsigned int block_start = blockIdx.x * block_increment;
-  const unsigned int block_stop  = min(block_start + block_increment, size);
-
-  for (unsigned int row  = block_start + threadIdx.x / SubWarpSizeV;
-                    row  < block_stop;
-                    row += blockDim.x / SubWarpSizeV)
-  {
-    NumericT dot_prod = NumericT(0);
-    unsigned int row_end = row_indices[row+1];
-    for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += 
SubWarpSizeV)
-      dot_prod += elements[i] * p[column_indices[i]];
-
-    shared_elements[threadIdx.x] = dot_prod;
-    if (1  < SubWarpSizeV) shared_elements[threadIdx.x] += 
shared_elements[threadIdx.x ^  1];
-    if (2  < SubWarpSizeV) shared_elements[threadIdx.x] += 
shared_elements[threadIdx.x ^  2];
-    if (4  < SubWarpSizeV) shared_elements[threadIdx.x] += 
shared_elements[threadIdx.x ^  4];
-    if (8  < SubWarpSizeV) shared_elements[threadIdx.x] += 
shared_elements[threadIdx.x ^  8];
-    if (16 < SubWarpSizeV) shared_elements[threadIdx.x] += 
shared_elements[threadIdx.x ^ 16];
-
-    if (id_in_row == 0)
-    {
-      Ap[row] = shared_elements[threadIdx.x];
-      inner_prod_ApAp += shared_elements[threadIdx.x] * 
shared_elements[threadIdx.x];
-      inner_prod_pAp  +=                       p[row] * 
shared_elements[threadIdx.x];
-    }
-  }
-
-  ////////// parallel reduction in work group
-  __shared__ NumericT shared_array_ApAp[256];
-  __shared__ NumericT shared_array_pAp[256];
-  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
-  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
-  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
-  {
-    __syncthreads();
-    if (threadIdx.x < stride)
-    {
-      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + 
stride];
-      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
-    }
-  }
-
-  // write results to result array
-  if (threadIdx.x == 0) {
-    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
-    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
-  }
-
-}
-
-template<typename NumericT>
-__global__ void pipelined_cg_csr_vec_mul_adaptive_kernel(
-          const unsigned int * row_indices,
-          const unsigned int * column_indices,
-          const unsigned int * row_blocks,
-          const NumericT * elements,
-          unsigned int num_blocks,
-          const NumericT * p,
-          NumericT * Ap,
-          unsigned int size,
-          NumericT * inner_prod_buffer,
-          unsigned int buffer_size)
-{
-  NumericT inner_prod_ApAp = 0;
-  NumericT inner_prod_pAp = 0;
-
-  __shared__ NumericT     shared_elements[1024];
-
-  for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += 
gridDim.x)
-  {
-    unsigned int row_start = row_blocks[block_id];
-    unsigned int row_stop  = row_blocks[block_id + 1];
-    unsigned int element_start = row_indices[row_start];
-    unsigned int element_stop = row_indices[row_stop];
-    unsigned int rows_to_process = row_stop - row_start;
-
-    if (rows_to_process > 1)  // CSR stream with one thread per row
-    {
-      // load to shared buffer:
-      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i 
+= blockDim.x)
-        shared_elements[i - element_start] = elements[i] * 
p[column_indices[i]];
-
-      __syncthreads();
-
-      // use one thread per row to sum:
-      for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += 
blockDim.x)
-      {
-        NumericT dot_prod = 0;
-        unsigned int thread_row_start = row_indices[row]     - element_start;
-        unsigned int thread_row_stop  = row_indices[row + 1] - element_start;
-        for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
-          dot_prod += shared_elements[i];
-        Ap[row] = dot_prod;
-        inner_prod_ApAp += dot_prod * dot_prod;
-        inner_prod_pAp  +=   p[row] * dot_prod;
-      }
-    }
-    // TODO here: Consider CSR vector for two to four rows (cf. OpenCL 
implementation. Experience on Fermi suggests that this may not be necessary)
-    else // CSR vector for a single row
-    {
-      // load and sum to shared buffer:
-      shared_elements[threadIdx.x] = 0;
-      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i 
+= blockDim.x)
-        shared_elements[threadIdx.x] += elements[i] * p[column_indices[i]];
-
-      // reduction to obtain final result
-      for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
-      {
-        __syncthreads();
-        if (threadIdx.x < stride)
-          shared_elements[threadIdx.x] += shared_elements[threadIdx.x+stride];
-      }
-
-      if (threadIdx.x == 0)
-      {
-        Ap[row_start] = shared_elements[0];
-        inner_prod_ApAp += shared_elements[0] * shared_elements[0];
-        inner_prod_pAp  +=       p[row_start] * shared_elements[0];
-      }
-    }
-
-    __syncthreads();  // avoid race conditions
-  }
-
-  ////////// parallel reduction in work group
-  __shared__ NumericT shared_array_ApAp[256];
-  __shared__ NumericT shared_array_pAp[256];
-  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
-  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
-  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
-  {
-    __syncthreads();
-    if (threadIdx.x < stride)
-    {
-      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + 
stride];
-      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
-    }
-  }
-
-  // write results to result array
-  if (threadIdx.x == 0) {
-    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
-    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
-  }
-}
-
-
-
-
-template<typename NumericT>
-void pipelined_cg_prod(compressed_matrix<NumericT> const & A,
-                       vector_base<NumericT> const & p,
-                       vector_base<NumericT> & Ap,
-                       vector_base<NumericT> & inner_prod_buffer)
-{
-  unsigned int size = p.size();
-  unsigned int buffer_size_per_vector = static_cast<unsigned 
int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500
-  if (double(A.nnz()) / double(A.size1()) > 6.4) // less than 10% of threads 
expected to idle
-  {
-    pipelined_cg_csr_vec_mul_blocked_kernel<8,  NumericT><<<256, 256>>>(   // 
experience on a GTX 750 Ti suggests that 8 is a substantially better choice here
-#else
-  if (double(A.nnz()) / double(A.size1()) > 12.0) // less than 25% of threads 
expected to idle
-  {
-    pipelined_cg_csr_vec_mul_blocked_kernel<16, NumericT><<<256, 256>>>(   // 
Fermi and Kepler prefer 16 threads per row (half-warp)
-#endif
-                                                                        
viennacl::cuda_arg<unsigned int>(A.handle1()),
-                                                                        
viennacl::cuda_arg<unsigned int>(A.handle2()),
-                                                                        
viennacl::cuda_arg<NumericT>(A.handle()),
-                                                                        
viennacl::cuda_arg(p),
-                                                                        
viennacl::cuda_arg(Ap),
-                                                                        size,
-                                                                        
viennacl::cuda_arg(inner_prod_buffer),
-                                                                        
buffer_size_per_vector
-                                                                       );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_blocked_kernel");
-  }
-  else
-  {
-    pipelined_cg_csr_vec_mul_adaptive_kernel<<<256, 
256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
-                                                           
viennacl::cuda_arg<unsigned int>(A.handle2()),
-                                                           
viennacl::cuda_arg<unsigned int>(A.handle3()),
-                                                           
viennacl::cuda_arg<NumericT>(A.handle()),
-                                                           
static_cast<unsigned int>(A.blocks1()),
-                                                           
viennacl::cuda_arg(p),
-                                                           
viennacl::cuda_arg(Ap),
-                                                           size,
-                                                           
viennacl::cuda_arg(inner_prod_buffer),
-                                                           
buffer_size_per_vector);
-    VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_kernel");
-  }
-}
-
-
-//
-// Coordinate Matrix
-//
-
-
-template<typename NumericT>
-__global__ void pipelined_cg_coo_vec_mul_kernel(const unsigned int * coords, 
//(row_index, column_index)
-                                                const NumericT * elements,
-                                                const unsigned int * 
group_boundaries,
-                                                const NumericT * p,
-                                                NumericT * Ap,
-                                                unsigned int size,
-                                                NumericT * inner_prod_buffer,
-                                                unsigned int buffer_size)
-{
-  NumericT inner_prod_ApAp = 0;
-  NumericT inner_prod_pAp  = 0;
-  __shared__ unsigned int shared_rows[128];
-  __shared__ NumericT inter_results[128];
-
-  uint2 tmp;
-  NumericT val;
-  unsigned int group_start = group_boundaries[blockIdx.x];
-  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
-  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - 
group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if 
group_end - group_start == j * blockDim.x
-
-  unsigned int local_index = 0;
-
-  for (unsigned int k = 0; k < k_end; ++k)
-  {
-    local_index = group_start + k * blockDim.x + threadIdx.x;
-
-    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : 
::make_uint2(0, 0);
-    val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0;
-
-    //check for carry from previous loop run:
-    if (threadIdx.x == 0 && k > 0)
-    {
-      if (tmp.x == shared_rows[blockDim.x-1])
-        val += inter_results[blockDim.x-1];
-      else
-      {
-        NumericT Ap_entry = inter_results[blockDim.x-1];
-        Ap[shared_rows[blockDim.x-1]] = Ap_entry;
-        inner_prod_ApAp += Ap_entry * Ap_entry;
-        inner_prod_pAp  += Ap_entry * p[shared_rows[blockDim.x-1]];
-      }
-    }
-
-    //segmented parallel reduction begin
-    __syncthreads();
-    shared_rows[threadIdx.x] = tmp.x;
-    inter_results[threadIdx.x] = val;
-    NumericT left = 0;
-    __syncthreads();
-
-    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
-    {
-      left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - 
stride]) ? inter_results[threadIdx.x - stride] : 0;
-      __syncthreads();
-      inter_results[threadIdx.x] += left;
-      __syncthreads();
-    }
-    //segmented parallel reduction end
-
-    if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
-        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
-    {
-      NumericT Ap_entry = inter_results[threadIdx.x];
-      Ap[tmp.x] = Ap_entry;
-      inner_prod_ApAp += Ap_entry * Ap_entry;
-      inner_prod_pAp  += Ap_entry * p[tmp.x];
-    }
-
-    __syncthreads();
-  } //for k
-
-  if (local_index + 1 == group_end)
-  {
-    NumericT Ap_entry = inter_results[threadIdx.x];
-    Ap[tmp.x] = Ap_entry;
-    inner_prod_ApAp += Ap_entry * Ap_entry;
-    inner_prod_pAp  += Ap_entry * p[tmp.x];
-  }
-
-  ////////// parallel reduction in work group
-  __shared__ NumericT shared_array_ApAp[256];
-  __shared__ NumericT shared_array_pAp[256];
-  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
-  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
-  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
-  {
-    __syncthreads();
-    if (threadIdx.x < stride)
-    {
-      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + 
stride];
-      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
-    }
-  }
-
-  // write results to result array
-  if (threadIdx.x == 0) {
-    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
-    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
-  }
-
-}
-
-
-template<typename NumericT>
-void pipelined_cg_prod(coordinate_matrix<NumericT> const & A,
-                       vector_base<NumericT> const & p,
-                       vector_base<NumericT> & Ap,
-                       vector_base<NumericT> & inner_prod_buffer)
-{
-  unsigned int size = p.size();
-  unsigned int buffer_size_per_vector = static_cast<unsigned 
int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
-
-  Ap.clear();
-
-  pipelined_cg_coo_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned 
int>(A.handle12()),
-                                                
viennacl::cuda_arg<NumericT>(A.handle()),
-                                                viennacl::cuda_arg<unsigned 
int>(A.handle3()),
-                                                viennacl::cuda_arg(p),
-                                                viennacl::cuda_arg(Ap),
-                                                size,
-                                                
viennacl::cuda_arg(inner_prod_buffer),
-                                                buffer_size_per_vector);
-  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_coo_vec_mul_kernel");
-}
-
-
-
-//
-// ELL Matrix
-//
-
-template<typename NumericT>
-__global__ void pipelined_cg_ell_vec_mul_kernel(const unsigned int * coords,
-                                                const NumericT * elements,
-                                                unsigned int internal_row_num,
-                                                unsigned int items_per_row,
-                                                const NumericT * p,
-                                                NumericT * Ap,
-                                                unsigned int size,
-                                                NumericT * inner_prod_buffer,
-                                                unsigned int buffer_size)
-{
-  NumericT inner_prod_ApAp = 0;
-  NumericT inner_prod_pAp  = 0;
-  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
-  unsigned int glb_sz = gridDim.x * blockDim.x;
-
-  for (unsigned int row = glb_id; row < size; row += glb_sz)
-  {
-    NumericT sum = 0;
-
-    unsigned int offset = row;
-    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset 
+= internal_row_num)
-    {
-      NumericT val = elements[offset];
-      sum += val ? p[coords[offset]] * val : NumericT(0);
-    }
-
-    Ap[row] = sum;
-    inner_prod_ApAp += sum * sum;
-    inner_prod_pAp  += sum * p[row];
-  }
-
-  ////////// parallel reduction in work group
-  __shared__ NumericT shared_array_ApAp[256];
-  __shared__ NumericT shared_array_pAp[256];
-  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
-  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
-  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
-  {
-    __syncthreads();
-    if (threadIdx.x < stride)
-    {
-      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + 
stride];
-      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
-    }
-  }
-
-  // write results to result array
-  if (threadIdx.x == 0) {
-    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
-    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
-  }
-}
-
-
-template<typename NumericT>
-void pipelined_cg_prod(ell_matrix<NumericT> const & A,
-                       vector_base<NumericT> const & p,
-                       vector_base<NumericT> & Ap,
-                       vector_base<NumericT> & inner_prod_buffer)
-{
-  unsigned int size = p.size();
-  unsigned int buffer_size_per_vector = static_cast<unsigned 
int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
-
-  pipelined_cg_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned 
int>(A.handle2()),
-                                                
viennacl::cuda_arg<NumericT>(A.handle()),
-                                                static_cast<unsigned 
int>(A.internal_size1()),
-                                                static_cast<unsigned 
int>(A.maxnnz()),
-                                                viennacl::cuda_arg(p),
-                                                viennacl::cuda_arg(Ap),
-                                                size,
-                                                
viennacl::cuda_arg(inner_prod_buffer),
-                                                buffer_size_per_vector);
-  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_ell_vec_mul_kernel");
-}
-
-
-//
-// SELL-C-\sigma Matrix
-//
-
-template<typename NumericT>
-__global__ void pipelined_cg_sliced_ell_vec_mul_kernel(const unsigned int * 
columns_per_block,
-                                                       const unsigned int * 
column_indices,
-                                                       const unsigned int * 
block_start,
-                                                       const NumericT * 
elements,
-                                                       const NumericT * p,
-                                                       NumericT * Ap,
-                                                       unsigned int size,
-                                                       unsigned int block_size,
-                                                       NumericT * 
inner_prod_buffer,
-                                                       unsigned int 
buffer_size)
-{
-  NumericT inner_prod_ApAp = 0;
-  NumericT inner_prod_pAp  = 0;
-
-  unsigned int blocks_per_threadblock = blockDim.x / block_size;
-  unsigned int id_in_block = threadIdx.x % block_size;
-  unsigned int num_blocks = (size - 1) / block_size + 1;
-  unsigned int global_warp_count = blocks_per_threadblock * gridDim.x;
-  unsigned int global_warp_id = blocks_per_threadblock * blockIdx.x + 
threadIdx.x / block_size;
-
-  for (unsigned int block_idx = global_warp_id; block_idx < num_blocks; 
block_idx += global_warp_count)
-  {
-    unsigned int row         = block_idx * block_size + id_in_block;
-    unsigned int offset      = block_start[block_idx];
-    unsigned int num_columns = columns_per_block[block_idx];
-
-    NumericT sum = 0;
-    for (unsigned int item_id = 0; item_id < num_columns; item_id++)
-    {
-      unsigned int index = offset + item_id * block_size + id_in_block;
-      NumericT val = elements[index];
-
-      sum += val ? (p[column_indices[index]] * val) : 0;
-    }
-
-    if (row < size)
-    {
-      Ap[row] = sum;
-      inner_prod_ApAp += sum * sum;
-      inner_prod_pAp  += sum * p[row];
-    }
-  }
-
-  ////////// parallel reduction in work group
-  __shared__ NumericT shared_array_ApAp[256];
-  __shared__ NumericT shared_array_pAp[256];
-  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
-  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
-  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
-  {
-    __syncthreads();
-    if (threadIdx.x < stride)
-    {
-      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + 
stride];
-      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
-    }
-  }
-
-  // write results to result array
-  if (threadIdx.x == 0) {
-    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
-    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
-  }
-}
-
-template<typename NumericT>
-void pipelined_cg_prod(sliced_ell_matrix<NumericT> const & A,
-                       vector_base<NumericT> const & p,
-                       vector_base<NumericT> & Ap,
-                       vector_base<NumericT> & inner_prod_buffer)
-{
-  unsigned int size = p.size();
-  unsigned int buffer_size_per_vector = static_cast<unsigned 
int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
-
-  pipelined_cg_sliced_ell_vec_mul_kernel<<<256, 
256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
-                                                       
viennacl::cuda_arg<unsigned int>(A.handle2()),
-                                                       
viennacl::cuda_arg<unsigned int>(A.handle3()),
-                                                       
viennacl::cuda_arg<NumericT>(A.handle()),
-                                                       viennacl::cuda_arg(p),
-                                                       viennacl::cuda_arg(Ap),
-                                                       size,
-                                                       static_cast<unsigned 
int>(A.rows_per_block()),
-                                                       
viennacl::cuda_arg(inner_prod_buffer),
-                                                       buffer_size_per_vector);
-  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_sliced_ell_vec_mul_kernel");
-}
-
-
-//
-// Hybrid Matrix
-//
-
-
-template<typename NumericT>
-__global__ void pipelined_cg_hyb_vec_mul_kernel(const unsigned int * 
ell_coords,
-                                                const NumericT * ell_elements,
-                                                const unsigned int * csr_rows,
-                                                const unsigned int * csr_cols,
-                                                const NumericT * csr_elements,
-                                                unsigned int internal_row_num,
-                                                unsigned int items_per_row,
-                                                const NumericT * p,
-                                                NumericT * Ap,
-                                                unsigned int size,
-                                                NumericT * inner_prod_buffer,
-                                                unsigned int buffer_size)
-{
-  NumericT inner_prod_ApAp = 0;
-  NumericT inner_prod_pAp  = 0;
-  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
-  unsigned int glb_sz = gridDim.x * blockDim.x;
-
-  for (unsigned int row = glb_id; row < size; row += glb_sz)
-  {
-    NumericT sum = 0;
-
-    unsigned int offset = row;
-    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset 
+= internal_row_num)
-    {
-      NumericT val = ell_elements[offset];
-
-      sum += val ? p[ell_coords[offset]] * val : NumericT(0);
-    }
-
-    unsigned int col_begin = csr_rows[row];
-    unsigned int col_end   = csr_rows[row + 1];
-
-    for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
-    {
-      sum += p[csr_cols[item_id]] * csr_elements[item_id];
-    }
-
-    Ap[row] = sum;
-    inner_prod_ApAp += sum * sum;
-    inner_prod_pAp  += sum * p[row];
-  }
-
-  ////////// parallel reduction in work group
-  __shared__ NumericT shared_array_ApAp[256];
-  __shared__ NumericT shared_array_pAp[256];
-  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
-  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
-  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
-  {
-    __syncthreads();
-    if (threadIdx.x < stride)
-    {
-      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + 
stride];
-      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
-    }
-  }
-
-  // write results to result array
-  if (threadIdx.x == 0) {
-    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
-    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
-  }
-}
-
-
-
-template<typename NumericT>
-void pipelined_cg_prod(hyb_matrix<NumericT> const & A,
-                       vector_base<NumericT> const & p,
-                       vector_base<NumericT> & Ap,
-                       vector_base<NumericT> & inner_prod_buffer)
-{
-  unsigned int size = p.size();
-  unsigned int buffer_size_per_vector = static_cast<unsigned 
int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
-
-  pipelined_cg_hyb_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned 
int>(A.handle2()),
-                                                
viennacl::cuda_arg<NumericT>(A.handle()),
-                                                viennacl::cuda_arg<unsigned 
int>(A.handle3()),
-                                                viennacl::cuda_arg<unsigned 
int>(A.handle4()),
-                                                
viennacl::cuda_arg<NumericT>(A.handle5()),
-                                                static_cast<unsigned 
int>(A.internal_size1()),
-                                                static_cast<unsigned 
int>(A.ell_nnz()),
-                                                viennacl::cuda_arg(p),
-                                                viennacl::cuda_arg(Ap),
-                                                size,
-                                                
viennacl::cuda_arg(inner_prod_buffer),
-                                                buffer_size_per_vector);
-  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_hyb_vec_mul_kernel");
-}
-
-
-
-/////////////////////////////////////
-
-template<typename NumericT>
-__global__ void pipelined_bicgstab_update_s_kernel(NumericT * s,
-                                                   NumericT const * residual,
-                                                   NumericT const * Ap,
-                                                   unsigned int size,
-                                                   NumericT * 
inner_prod_buffer,
-                                                   unsigned int chunk_size,
-                                                   unsigned int chunk_offset)
-{
-  NumericT alpha = 0;
-
-  // parallel reduction in work group to compute <r, r0> / <Ap, r0>
-  __shared__ NumericT shared_array[256];
-  __shared__ NumericT shared_array_Ap_in_r0[256];
-
-  shared_array[threadIdx.x] = inner_prod_buffer[threadIdx.x];
-  shared_array_Ap_in_r0[threadIdx.x] = inner_prod_buffer[threadIdx.x + 3 * 
chunk_size];
-  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
-  {
-    __syncthreads();
-    if (threadIdx.x < stride) {
-      shared_array[threadIdx.x]          += shared_array[threadIdx.x + stride];
-      shared_array_Ap_in_r0[threadIdx.x] += shared_array_Ap_in_r0[threadIdx.x 
+ stride];
-    }
-  }
-
-  // compute alpha from reduced values:
-  __syncthreads();
-  alpha = shared_array[0] / shared_array_Ap_in_r0[0];
-
-  // run vector update and compute first stage of <s, s>
-  NumericT inner_prod_contrib = 0;
-  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += 
gridDim.x * blockDim.x)
-  {
-    NumericT value_s = s[i];
-
-    value_s = residual[i] - alpha * Ap[i];
-    inner_prod_contrib += value_s * value_s;
-
-    s[i] = value_s;
-  }
-  __syncthreads();
-
-  // parallel reduction in work group
-  shared_array[threadIdx.x] = inner_prod_contrib;
-  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
-  {
-    __syncthreads();
-    if (threadIdx.x < stride)
-      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
-  }
-
-  // write results to inner_prod_buffer
-  if (threadIdx.x == 0)
-    inner_prod_buffer[blockIdx.x + chunk_offset] = shared_array[0];
-}
-
-template<typename NumericT>
-void pipelined_bicgstab_update_s(vector_base<NumericT> & s,
-                                 vector_base<NumericT> & r,
-                                 vector_base<NumericT> const & Ap,
-                                 vector_base<NumericT> & inner_prod_buffer,
-                                 vcl_size_t buffer_chunk_size,
-                                 vcl_size_t buffer_chunk_offset)
-{
-  unsigned int size = static_cast<unsigned int>(s.size());
-  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
-  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
-
-  pipelined_bicgstab_update_s_kernel<<<256, 256>>>(viennacl::cuda_arg(s),
-                                                   viennacl::cuda_arg(r),
-                                                   viennacl::cuda_arg(Ap),
-                                                   size,
-                                                   
viennacl::cuda_arg(inner_prod_buffer),
-                                                   chunk_size,
-                                                   chunk_offset);
-  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_update_s_kernel");
-}
-
-template<typename NumericT>
-__global__ void pipelined_bicgstab_vector_kernel(NumericT * result,
-                                                 NumericT alpha,
-                                                 NumericT * p,
-                                                 NumericT omega,
-                                                 NumericT const * s,
-                                                 NumericT * residual,
-                                                 NumericT const * As,
-                                                 NumericT beta,
-                                                 NumericT const * Ap,
-                                                 NumericT const * r0star,
-                                                 NumericT * inner_prod_buffer,
-                                                 unsigned int size)
-{
-  NumericT inner_prod_r_r0star = 0;
-  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += 
gridDim.x * blockDim.x)
-  {
-    NumericT value_result = result[i];
-    NumericT value_p = p[i];
-    NumericT value_s = s[i];
-    NumericT value_residual = residual[i];
-    NumericT value_As = As[i];
-    NumericT value_Ap = Ap[i];
-    NumericT value_r0star = r0star[i];
-
-    value_result   += alpha * value_p + omega * value_s;
-    value_residual  = value_s - omega * value_As;
-    value_p         = value_residual + beta * (value_p - omega * value_Ap);
-
-    result[i]   = value_result;
-    residual[i] = value_residual;
-    p[i]        = value_p;
-    inner_prod_r_r0star += value_residual * value_r0star;
-  }
-
-  // parallel reduction in work group
-  __shared__ NumericT shared_array[256];
-  shared_array[threadIdx.x] = inner_prod_r_r0star;
-  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
-  {
-    __syncthreads();
-    if (threadIdx.x < stride)
-      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
-  }
-
-  // write results to result array
-  if (threadIdx.x == 0)
-    inner_prod_buffer[blockIdx.x] = shared_array[0];
-}
-
-
-template<typename NumericT>
-void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT 
alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & 
s,
-                                      vector_base<NumericT> & residual, 
vector_base<NumericT> const & As,
-                                      NumericT beta, vector_base<NumericT> 
const & Ap,
-                                      vector_base<NumericT> const & r0star,
-                                      vector_base<NumericT> & 
inner_prod_buffer, vcl_size_t buffer_chunk_size)
-{
-  (void)buffer_chunk_size;
-  unsigned int size = static_cast<unsigned int>(result.size());
-
-  pipelined_bicgstab_vector_kernel<<<256, 256>>>(viennacl::cuda_arg(result),
-                                                 alpha,
-                                                 viennacl::cuda_arg(p),
-                                                 omega,
-                                                 viennacl::cuda_arg(s),
-                                                 viennacl::cuda_arg(residual),
-                                                 viennacl::cuda_arg(As),
-                                                 beta,
-                                                 viennacl::cuda_arg(Ap),
-                                                 viennacl::cuda_arg(r0star),
-                                                 
viennacl::cuda_arg(inner_prod_buffer),
-                                                 size);
-  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_vector_kernel");
-}
-
-
-
-//
-// Compressed matrix
-//
-
-
-template<unsigned int SubWarpSizeV, typename NumericT>
-__global__ void pipelined_bicgstab_csr_vec_mul_blocked_kernel(
-          const unsigned int * row_indices,
-          const unsigned int * column_indices,
-          const NumericT * elements,
-          const NumericT * p,
-          NumericT * Ap,
-          const NumericT * r0star,
-          unsigned int size,
-          NumericT * inner_prod_buffer,
-          unsigned int buffer_size,
-          unsigned int buffer_offset)
-{
-  __shared__ NumericT shared_elements[256];
-  NumericT inner_prod_ApAp = 0;
-  NumericT inner_prod_pAp = 0;
-  NumericT inner_prod_r0Ap  = 0;
-
-  const unsigned int id_in_row = threadIdx.x % SubWarpSizeV;
-  const unsigned int block_increment = blockDim.x * ((size - 1) / (gridDim.x * 
blockDim.x) + 1);
-  const unsigned int block_start = blockIdx.x * block_increment;
-  const unsigned int block_stop  = min(block_start + block_increment, size);
-
-  for (unsigned int row  = block_start + threadIdx.x / SubWarpSizeV;
-                    row  < block_stop;
-                    row += blockDim.x / SubWarpSizeV)
-  {
-    NumericT dot_prod = NumericT(0);
-    unsigned int row_end = row_indices[row+1];
-    for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += 
SubWarpSizeV)
-      dot_prod += elements[i] * p[column_indices[i]];
-
-    shared_elements[threadIdx.x] = dot_prod;
-    if (1  < SubWarpSizeV) shared_elements[threadIdx.x] += 
shared_elements[threadIdx.x ^  1];
-    if (2  < SubWarpSizeV) shared_elements[threadIdx.x] += 
shared_elements[threadIdx.x ^  2];
-    if (4  < SubWarpSizeV) shared_elements[threadIdx.x] += 
shared_elements[threadIdx.x ^  4];
-    if (8  < SubWarpSizeV) shared_elements[threadIdx.x] += 
shared_elements[threadIdx.x ^  8];
-    if (16 < SubWarpSizeV) shared_elements[threadIdx.x] += 
shared_elements[threadIdx.x ^ 16];
-
-    if (id_in_row == 0)
-    {
-      Ap[row] = shared_elements[threadIdx.x];
-      inner_prod_ApAp += shared_elements[threadIdx.x] * 
shared_elements[threadIdx.x];
-      inner_prod_pAp  +=                       p[row] * 
shared_elements[threadIdx.x];
-      inner_prod_r0Ap +=                  r0star[row] * 
shared_elements[threadIdx.x];
-    }
-  }
-
-  ////////// parallel reduction in work group
-  __shared__ NumericT shared_array_ApAp[256];
-  __shared__ NumericT shared_array_pAp[256];
-  __shared__ NumericT shared_array_r0Ap[256];
-  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
-  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
-  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
-  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
-  {
-    __syncthreads();
-    if (threadIdx.x < stride)
-    {
-      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + 
stride];
-      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
-      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + 
stride];
-    }
-  }
-
-  // write results to result array
-  if (threadIdx.x == 0) {
-    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
-    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
-    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
-  }
-
-}
-
-
-template<typename NumericT>
-__global__ void pipelined_bicgstab_csr_vec_mul_adaptive_kernel(
-          const unsigned int * row_indices,
-          const unsigned int * column_indices,
-          const unsigned int * row_blocks,
-          const NumericT * elements,
-          unsigned int num_blocks,
-          const NumericT * p,
-          NumericT * Ap,
-          const NumericT * r0star,
-          unsigned int size,
-          NumericT * inner_prod_buffer,
-          unsigned int buffer_size,
-          unsigned int buffer_offset)
-{
-  NumericT inner_prod_ApAp = 0;
-  NumericT inner_prod_pAp = 0;
-  NumericT inner_prod_r0Ap  = 0;
-
-  __shared__ NumericT     shared_elements[1024];
-
-  for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += 
gridDim.x)
-  {
-    unsigned int row_start = row_blocks[block_id];
-    unsigned int row_stop  = row_blocks[block_id + 1];
-    unsigned int element_start = row_indices[row_start];
-    unsigned int element_stop = row_indices[row_stop];
-    unsigned int rows_to_process = row_stop - row_start;
-
-    if (rows_to_process > 1)  // CSR stream with one thread per row
-    {
-      // load to shared buffer:
-      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i 
+= blockDim.x)
-        shared_elements[i - element_start] = elements[i] * 
p[column_indices[i]];
-
-      __syncthreads();
-
-      // use one thread per row to sum:
-      for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += 
blockDim.x)
-      {
-        NumericT dot_prod = 0;
-        unsigned int thread_row_start = row_indices[row]     - element_start;
-        unsigned int thread_row_stop  = row_indices[row + 1] - element_start;
-        for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
-          dot_prod += shared_elements[i];
-        Ap[row] = dot_prod;
-        inner_prod_ApAp += dot_prod * dot_prod;
-        inner_prod_pAp  +=   p[row] * dot_prod;
-        inner_prod_r0Ap += r0star[row] * dot_prod;
-      }
-    }
-    // TODO here: Consider CSR vector for two to four rows (cf. OpenCL 
implementation. Experience on Fermi suggests that this may not be necessary)
-    else // CSR vector for a single row
-    {
-      // load and sum to shared buffer:
-      shared_elements[threadIdx.x] = 0;
-      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i 
+= blockDim.x)
-        shared_elements[threadIdx.x] += elements[i] * p[column_indices[i]];
-
-      // reduction to obtain final result
-      for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
-      {
-        __syncthreads();
-        if (threadIdx.x < stride)
-          shared_elements[threadIdx.x] += shared_elements[threadIdx.x+stride];
-      }
-
-      if (threadIdx.x == 0)
-      {
-        Ap[row_start] = shared_elements[0];
-        inner_prod_ApAp += shared_elements[0] * shared_elements[0];
-        inner_prod_pAp  +=       p[row_start] * shared_elements[0];
-        inner_prod_r0Ap +=  r0star[row_start] * shared_elements[0];
-      }
-    }
-
-    __syncthreads();  // avoid race conditions
-  }
-
-  ////////// parallel reduction in work group
-  __shared__ NumericT shared_array_ApAp[256];
-  __shared__ NumericT shared_array_pAp[256];
-  __shared__ NumericT shared_array_r0Ap[256];
-  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
-  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
-  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
-  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
-  {
-    __syncthreads();
-    if (threadIdx.x < stride)
-    {
-      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + 
stride];
-      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
-      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + 
stride];
-    }
-  }
-
-  // write results to result array
-  if (threadIdx.x == 0) {
-    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
-    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
-    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
-  }
-}
-
-
-
-
-template<typename NumericT>
-void pipelined_bicgstab_prod(compressed_matrix<NumericT> const & A,
-                             vector_base<NumericT> const & p,
-                             vector_base<NumericT> & Ap,
-                             vector_base<NumericT> const & r0star,
-                             vector_base<NumericT> & inner_prod_buffer,
-                             vcl_size_t buffer_chunk_size,
-                             vcl_size_t buffer_chunk_offset)
-{
-  unsigned int vec_size     = static_cast<unsigned 
int>(viennacl::traits::size(p));
-  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
-  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500
-  if (double(A.nnz()) / double(A.size1()) > 6.4) // less than 10% of threads 
expected to idle
-  {
-    pipelined_bicgstab_csr_vec_mul_blocked_kernel<8,  NumericT><<<256, 256>>>( 
  // experience on a GTX 750 Ti suggests that 8 is a substantially better 
choice here
-#else
-  if (double(A.nnz()) / double(A.size1()) > 12.0) // less than 25% of threads 
expected to idle
-  {
-    pipelined_bicgstab_csr_vec_mul_blocked_kernel<16, NumericT><<<256, 256>>>( 
  // Fermi and Kepler prefer 16 threads per row (half-warp)
-#endif
-                                                                        
viennacl::cuda_arg<unsigned int>(A.handle1()),
-                                                                        
viennacl::cuda_arg<unsigned int>(A.handle2()),
-                                                                        
viennacl::cuda_arg<NumericT>(A.handle()),
-                                                                        
viennacl::cuda_arg(p),
-                                                                        
viennacl::cuda_arg(Ap),
-                                                                        
viennacl::cuda_arg(r0star),
-                                                                        
vec_size,
-                                                                        
viennacl::cuda_arg(inner_prod_buffer),
-                                                                        
chunk_size,
-                                                                        
chunk_offset
-                                                                       );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_blocked_kernel");
-  }
-  else
-  {
-    pipelined_bicgstab_csr_vec_mul_adaptive_kernel<<<256, 
256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
-                                                                
viennacl::cuda_arg<unsigned int>(A.handle2()),
-                                                                
viennacl::cuda_arg<unsigned int>(A.handle3()),
-                                                                
viennacl::cuda_arg<NumericT>(A.handle()),
-                                                                
static_cast<unsigned int>(A.blocks1()),
-                                                                
viennacl::cuda_arg(p),
-                                                                
viennacl::cuda_arg(Ap),
-                                                                
viennacl::cuda_arg(r0star),
-                                                                vec_size,
-                                                                
viennacl::cuda_arg(inner_prod_buffer),
-                                                                chunk_size,
-                                                                chunk_offset);
-    
VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_csr_vec_mul_adaptive_kernel");
-  }
-}
-
-
-//
-// Coordinate Matrix
-//
-
-
-template<typename NumericT>
-__global__ void pipelined_bicgstab_coo_vec_mul_kernel(const unsigned int * 
coords, //(row_index, column_index)
-                                                const NumericT * elements,
-                                                const unsigned int * 
group_boundaries,
-                                                const NumericT * p,
-                                                NumericT * Ap,
-                                                const NumericT * r0star,
-                                                unsigned int size,
-                                                NumericT * inner_prod_buffer,
-                                                unsigned int buffer_size,
-                                                unsigned int buffer_offset)
-{
-  NumericT inner_prod_ApAp = 0;
-  NumericT inner_prod_pAp  = 0;
-  NumericT inner_prod_r0Ap  = 0;
-  __shared__ unsigned int shared_rows[128];
-  __shared__ NumericT inter_results[128];
-
-  uint2 tmp;
-  NumericT val;
-  unsigned int group_start = group_boundaries[blockIdx.x];
-  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
-  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - 
group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if 
group_end - group_start == j * blockDim.x
-
-  unsigned int local_index = 0;
-
-  for (unsigned int k = 0; k < k_end; ++k)
-  {
-    local_index = group_start + k * blockDim.x + threadIdx.x;
-
-    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : 
::make_uint2(0, 0);
-    val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0;
-
-    //check for carry from previous loop run:
-    if (threadIdx.x == 0 && k > 0)
-    {
-      if (tmp.x == shared_rows[blockDim.x-1])
-        val += inter_results[blockDim.x-1];
-      else
-      {
-        NumericT Ap_entry = inter_results[blockDim.x-1];
-        Ap[shared_rows[blockDim.x-1]] = Ap_entry;
-        inner_prod_ApAp += Ap_entry * Ap_entry;
-        inner_prod_pAp  += Ap_entry * p[shared_rows[blockDim.x-1]];
-        inner_prod_r0Ap += r0star[shared_rows[blockDim.x-1]] * Ap_entry;
-      }
-    }
-
-    //segmented parallel reduction begin
-    __syncthreads();
-    shared_rows[threadIdx.x] = tmp.x;
-    inter_results[threadIdx.x] = val;
-    NumericT left = 0;
-    __syncthreads();
-
-    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
-    {
-      left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - 
stride]) ? inter_results[threadIdx.x - stride] : 0;
-      __syncthreads();
-      inter_results[threadIdx.x] += left;
-      __syncthreads();
-    }
-    //segmented parallel reduction end
-
-    if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
-        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
-    {
-      NumericT Ap_entry = inter_results[threadIdx.x];
-      Ap[tmp.x] = Ap_entry;
-      inner_prod_ApAp += Ap_entry * Ap_entry;
-      inner_prod_pAp  += Ap_entry * p[tmp.x];
-      inner_prod_r0Ap += r0star[tmp.x] * Ap_entry;
-    }
-
-    __syncthreads();
-  } //for k
-
-  if (local_index + 1 == group_end)
-  {
-    NumericT Ap_entry = inter_results[threadIdx.x];
-    Ap[tmp.x] = Ap_entry;
-    inner_prod_ApAp += Ap_entry * Ap_entry;
-    inner_prod_pAp  += Ap_entry * p[tmp.x];
-    inner_prod_r0Ap += Ap_entry * r0star[tmp.x];
-  }
-
-  ////////// parallel reduction in work group
-  __shared__ NumericT shared_array_ApAp[256];
-  __shared__ NumericT shared_array_pAp[256];
-  __shared__ NumericT shared_array_r0Ap[256];
-  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
-  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
-  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
-  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
-  {
-    __syncthreads();
-    if (threadIdx.x < stride)
-    {
-      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + 
stride];
-      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
-      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + 
stride];
-    }
-  }
-
-  // write results to result array
-  if (threadIdx.x == 0) {
-    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
-    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
-    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
-  }
-
-}
-
-
-template<typename NumericT>
-void pipelined_bicgstab_prod(coordinate_matrix<NumericT> const & A,
-                             vector_base<NumericT> const & p,
-                             vector_base<NumericT> & Ap,
-                             vector_base<NumericT> const & r0star,
-                             vector_base<NumericT> & inner_prod_buffer,
-                             vcl_size_t buffer_chunk_size,
-                             vcl_size_t buffer_chunk_offset)
-{
-  unsigned int vec_size     = static_cast<unsigned 
int>(viennacl::traits::size(p));
-  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
-  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
-
-  Ap.clear();
-
-  pipelined_bicgstab_coo_vec_mul_kernel<<<64, 
128>>>(viennacl::cuda_arg<unsigned int>(A.handle12()),
-                                                      
viennacl::cuda_arg<NumericT>(A.handle()),
-                                                      
viennacl::cuda_arg<unsigned int>(A.handle3()),
-                                                      viennacl::cuda_arg(p),
-                                                      viennacl::cuda_arg(Ap),
-                                                      
viennacl::cuda_arg(r0star),
-                                                      vec_size,
-                                                      
viennacl::cuda_arg(inner_prod_buffer),
-                                                      chunk_size,
-                                                      chunk_offset);
-  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_coo_vec_mul_kernel");
-}
-
-
-
-//
-// ELL Matrix
-//
-
-template<typename NumericT>
-__global__ void pipelined_bicgstab_ell_vec_mul_kernel(const unsigned int * 
coords,
-                                                const NumericT * elements,
-                                                unsigned int internal_row_num,
-                                                unsigned int items_per_row,
-                                                const NumericT * p,
-                                                NumericT * Ap,
-                                                const NumericT * r0star,
-                                                unsigned int size,
-                                                NumericT * inner_prod_buffer,
-                                                unsigned int buffer_size,
-                                                unsigned int buffer_offset)
-{
-  NumericT inner_prod_ApAp = 0;
-  NumericT inner_prod_pAp  = 0;
-  NumericT inner_prod_r0Ap  = 0;
-  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
-  unsigned int glb_sz = gridDim.x * blockDim.x;
-
-  for (unsigned int row = glb_id; row < size; row += glb_sz)
-  {
-    NumericT sum = 0;
-
-    unsigned int offset = row;
-    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset 
+= internal_row_num)
-    {
-      NumericT val = elements[offset];
-      sum += val ? p[coords[offset]] * val : NumericT(0);
-    }
-
-    Ap[row] = sum;
-    inner_prod_ApAp += sum * sum;
-    inner_prod_pAp  += sum * p[row];
-    inner_prod_r0Ap += sum * r0star[row];
-  }
-
-  ////////// parallel reduction in work group
-  __shared__ NumericT shared_array_ApAp[256];
-  __shared__ NumericT shared_array_pAp[256];
-  __shared__ NumericT shared_array_r0Ap[256];
-  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
-  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
-  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
-  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
-  {
-    __syncthreads();
-    if (threadIdx.x < stride)
-    {
-      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + 
stride];
-      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
-      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + 
stride];
-    }
-  }
-
-  // write results to result array
-  if (threadIdx.x == 0) {
-    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
-    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
-    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
-  }
-}
-
-
-template<typename NumericT>
-void pipelined_bicgstab_prod(ell_matrix<NumericT> const & A,
-                             vector_base<NumericT> const & p,
-                             vector_base<NumericT> & Ap,
-                             vector_base<NumericT> const & r0star,
-                             vector_base<NumericT> & inner_prod_buffer,
-                             vcl_size_t buffer_chunk_size,
-                             vcl_size_t buffer_chunk_offset)
-{
-  unsigned int vec_size     = static_cast<unsigned 
int>(viennacl::traits::size(p));
-  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
-  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
-
-  pipelined_bicgstab_ell_vec_mul_kernel<<<256, 
256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
-                                                      
viennacl::cuda_arg<NumericT>(A.handle()),
-                                                      static_cast<unsigned 
int>(A.internal_size1()),
-                                                      static_cast<unsigned 
int>(A.maxnnz()),
-                                                      viennacl::cuda_arg(p),
-                                                      viennacl::cuda_arg(Ap),
-                                                      
viennacl::cuda_arg(r0star),
-                                                      vec_size,
-                                                      
viennacl::cuda_arg(inner_prod_buffer),
-                                                      chunk_size,
-                                                      chunk_offset);
-  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_ell_vec_mul_kernel");
-}
-
-
-//
-// SELL-C-\sigma Matrix
-//
-
-template<typename NumericT>
-__global__ void pipelined_bicgstab_sliced_ell_vec_mul_kernel(const unsigned 
int * columns_per_block,
-                                                             const unsigned 
int * column_indices,
-                                                             const unsigned 
int * block_start,
-                                                             const NumericT * 
elements,
-                                                             const NumericT * 
p,
-                                                             NumericT * Ap,
-                                                             const NumericT * 
r0star,
-                                                             unsigned int size,
-                                                             unsigned int 
block_size,
-                                                             NumericT * 
inner_prod_buffer,
-                                                             unsigned int 
buffer_size,
-                                                             unsigned int 
buffer_offset)
-{
-  NumericT inner_prod_ApAp = 0;
-  NumericT inner_prod_pAp  = 0;
-  NumericT inner_prod_r0Ap  = 0;
-
-  unsigned int blocks_per_threadblock = blockDim.x / block_size;
-  unsigned int id_in_block = threadIdx.x % block_size;
-  unsigned int num_blocks = (size - 1) / block_size + 1;
-  unsigned int global_warp_count = blocks_per_threadblock * gridDim.x;
-  unsigned int global_warp_id = blocks_per_threadblock * blockIdx.x + 
threadIdx.x / block_size;
-
-  for (unsigned int block_idx = global_warp_id; block_idx < num_blocks; 
block_idx += global_warp_count)
-  {
-    unsigned int row         = block_idx * block_size + id_in_block;
-    unsigned int offset      = block_start[block_idx];
-    unsigned int num_columns = columns_per_block[block_idx];
-
-    NumericT sum = 0;
-    for (unsigned int item_id = 0; item_id < num_columns; item_id++)
-    {
-      unsigned int index = offset + item_id * block_size + id_in_block;
-      NumericT val = elements[index];
-
-      sum += val ? (p[column_indices[index]] * val) : 0;
-    }
-
-    if (row < size)
-    {
-      Ap[row] = sum;
-      inner_prod_ApAp += sum * sum;
-      inner_prod_pAp  += sum * p[row];
-      inner_prod_r0Ap += sum * r0star[row];
-    }
-  }
-
-  ////////// parallel reduction in work group
-  __shared__ NumericT shared_array_ApAp[256];
-  __shared__ NumericT shared_array_pAp[256];
-  __shared__ NumericT shared_array_r0Ap[256];
-  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
-  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
-  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
-  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
-  {
-    __syncthreads();
-    if (threadIdx.x < stride)
-    {
-      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + 
stride];
-      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
-      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + 
stride];
-    }
-  }
-
-  // write results to result array
-  if (threadIdx.x == 0) {
-    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
-    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
-    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
-  }
-}
-
-template<typename NumericT>
-void pipelined_bicgstab_prod(sliced_ell_matrix<NumericT> const & A,
-                             vector_base<NumericT> const & p,
-                             vector_base<NumericT> & Ap,
-                             vector_base<NumericT> const & r0star,
-                             vector_base<NumericT> & inner_prod_buffer,
-                             vcl_size_t buffer_chunk_size,
-                             vcl_size_t buffer_chunk_offset)
-{
-  unsigned int vec_size     = static_cast<unsigned 
int>(viennacl::traits::size(p));
-  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
-  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
-
-  pipelined_bicgstab_sliced_ell_vec_mul_kernel<<<256, 
256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
-                                                             
viennacl::cuda_arg<unsigned int>(A.handle2()),
-                                                             
viennacl::cuda_arg<unsigned int>(A.handle3()),
-                                                             
viennacl::cuda_arg<NumericT>(A.handle()),
-                                                             
viennacl::cuda_arg(p),
-                                                             
viennacl::cuda_arg(Ap),
-                                                             
viennacl::cuda_arg(r0star),
-                                                             vec_size,
-                                                             
static_cast<unsigned int>(A.rows_per_block()),
-                                                             
viennacl::cuda_arg(inner_prod_buffer),
-                                                             chunk_size,
-                                                             chunk_offset);
-  
VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_sliced_ell_vec_mul_kernel");
-}
-
-
-//
-// Hybrid Matrix
-//
-
-
-template<typename NumericT>
-__global__ void pipelined_bicgstab_hyb_vec_mul_kernel(const unsigned int * 
ell_coords,
-                                                      const NumericT * 
ell_elements,
-                                                      const unsigned int * 
csr_rows,
-                                                      const unsigned int * 
csr_cols,
-                                                      const NumericT * 
csr_elements,
-                                                      unsigned int 
internal_row_num,
-                                                      unsigned int 
items_per_row,
-                                                      const NumericT * p,
-                                                      NumericT * Ap,
-                                                      const NumericT * r0star,
-                                                      unsigned int size,
-                                                      NumericT * 
inner_prod_buffer,
-                                                      unsigned int buffer_size,
-                                                      unsigned int 
buffer_offset)
-{
-  NumericT inner_prod_ApAp = 0;
-  NumericT inner_prod_pAp  = 0;
-  NumericT inner_prod_r0Ap  = 0;
-  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
-  unsigned int glb_sz = gridDim.x * blockDim.x;
-
-  for (unsigned int row = glb_id; row < size; row += glb_sz)
-  {
-    NumericT sum = 0;
-
-    unsigned int offset = row;
-    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset 
+= internal_row_num)
-    {
-      NumericT val = ell_elements[offset];
-
-      sum += val ? p[ell_coords[offset]] * val : NumericT(0);
-    }
-
-    unsigned int col_begin = csr_rows[row];
-    unsigned int col_end   = csr_rows[row + 1];
-
-    for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
-    {
-      sum += p[csr_cols[item_id]] * csr_elements[item_id];
-    }
-
-    Ap[row] = sum;
-    inner_prod_ApAp += sum * sum;
-    inner_prod_pAp  += sum * p[row];
-    inner_prod_r0Ap += sum * r0star[row];
-  }
-
-  ////////// parallel reduction in work group
-  __shared__ NumericT shared_array_ApAp[256];
-  __shared__ NumericT shared_array_pAp[256];
-  __shared__ NumericT shared_array_r0Ap[256];
-  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
-  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
-  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
-  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
-  {
-    __syncthreads();
-    if (threadIdx.x < stride)
-    {
-      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + 
stride];
-      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
-      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + 
stride];
-    }
-  }
-
-  // write results to result array
-  if (threadIdx.x == 0) {
-    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
-    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
-    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
-  }
-}
-
-
-
-template<typename NumericT>
-void pipelined_bicgstab_prod(hyb_matrix<NumericT> const & A,
-                             vector_base<NumericT> const & p,
-                             vector_base<NumericT> & Ap,
-                             vector_base<NumericT> const & r0star,
-                             vector_base<NumericT> & inner_prod_buffer,
-                             vcl_size_t buffer_chunk_size,
-                             vcl_size_t buffer_chunk_offset)
-{
-  unsigned int vec_size     = static_cast<unsigned 
int>(viennacl::traits::size(p));
-  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
-  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
-
-  pipelined_bicgstab_hyb_vec_mul_kernel<<<256, 
256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
-                                                      
viennacl::cuda_arg<NumericT>(A.handle()),
-                                                      
viennacl::cuda_arg<unsigned int>(A.handle3()),
-                                                      
viennacl::cuda_arg<unsigned int>(A.handle4()),
-                                                      
viennacl::cuda_arg<NumericT>(A.handle5()),
-                                                      static_cast<unsigned 
int>(A.internal_size1()),
-                                                      static_cast<unsigned 
int>(A.ell_nnz()),
-                                                      viennacl::cuda_arg(p),
-                                                      viennacl::cuda_arg(Ap),
-                                                      
viennacl::cuda_arg(r0star),
-                                                      vec_size,
-                                                      
viennacl::cuda_arg(inner_prod_buffer),
-                                                      chunk_size,
-                                                      chunk_offset);
-  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_hyb_vec_mul_kernel");
-}
-
-//////////////////////////////////////////
-
-template <typename T>
-__global__ void pipelined_gmres_normalize_vk_kernel(T * vk,
-                                                    unsigned int vk_offset,
-                                                    T const * residual,
-                                                    T * R_buffer,
-                                                    unsigned int R_offset,
-                                                    T const * 
inner_prod_buffer,
-                                                    unsigned int chunk_size,
-                                                    T * r_dot_vk_buffer,
-                                                    unsigned int chunk_offset,
-                                                    unsigned int size)
-{
-  __shared__ T shared_array[128];
-  T norm_vk = 0;
-
-  // parallel reduction in work group to compute <vk, vk>
-  shared_array[threadIdx.x] = inner_prod_buffer[threadIdx.x + chunk_size];
-  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
-  {
-    __syncthreads();
-    if (threadIdx.x < stride)
-      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
-  }
-
-  // compute alpha from reduced values:
-  __syncthreads();
-  norm_vk = sqrt(shared_array[0]);
-
-  T inner_prod_contrib = 0;
-  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += 
gridDim.x * blockDim.x) {
-    T value_vk = vk[i + vk_offset] / norm_vk;
-
-    inner_prod_contrib += residual[i] * value_vk;
-
-    vk[i + vk_offset] = value_vk;
-  }
-  __syncthreads();
-
-  // parallel reduction in work group
-  shared_array[threadIdx.x] = inner_prod_contrib;
-  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
-  {
-    __syncthreads();
-    if (threadIdx.x < stride)
-      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
-  }
-
-  // write results of first reduction stage:
-  if (threadIdx.x == 0)
-    r_dot_vk_buffer[blockIdx.x + chunk_offset] = shared_array[0];
-  // store norm:
-  if (blockDim.x * blockIdx.x + threadIdx.x == 0)
-    R_buffer[R_offset] = norm_vk;
-}
-
-/** @brief Performs a vector normalization needed for an efficient pipelined 
GMRES algorithm.
-  *
-  * This routines computes for vectors 'r', 'v_k':
-  *   Second reduction step for ||v_k||
-  *   v_k /= ||v_k||
-  *   First reduction step for <r, v_k>
-  */
-template <typename T>
-void pipelined_gmres_normalize_vk(vector_base<T> & v_k,
-                                  vector_base<T> const & residual,
-                                  vector_base<T> & R_buffer,
-                                  vcl_size_t offset_in_R,
-                                  vector_base<T> const & inner_prod_buffer,
-                                  vector_base<T> & r_dot_vk_buffer,
-                                  vcl_size_t buffer_chunk_size,
-                                  vcl_size_t buffer_chunk_offset)
-{
-  unsigned int vk_offset = viennacl::traits::start(v_k);
-  unsigned int R_offset = offset_in_R;
-  unsigned int chunk_size = buffer_chunk_size;
-  unsigned int chunk_offset = buffer_chunk_offset;
-  unsigned int size = v_k.size();
-
-  pipelined_gmres_normalize_vk_kernel<<<128, 128>>>(viennacl::cuda_arg(v_k),
-                                                    vk_offset,
-                                                    
viennacl::cuda_arg(residual),
-                                                    
viennacl::cuda_arg(R_buffer),
-                                                    R_offset,
-                                                    
viennacl::cuda_arg(inner_prod_buffer),
-                                                    chunk_size,
-                                                    
viennacl::cuda_arg(r_dot_vk_buffer),
-                                                    chunk_offset,
-                                                    size);
-  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_normalize_vk_kernel");
-}
-
-
-
-template <typename T>
-__global__ void pipelined_gmres_gram_schmidt_stage1_kernel(T const * 
krylov_basis,
-                                                           unsigned int size,
-                                                           unsigned int 
internal_size,
-                                                           unsigned int k,
-                                                           T * vi_in_vk_buffer,
-                                                           unsigned int 
chunk_size)
-{
-  __shared__ T shared_array[7*128];
-  T value_vk = 0;
-
-  unsigned int k_base = 0;
-  while (k_base < k)
-  {
-    unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);
-
-    for (unsigned int j=0; j<vecs_in_iteration; ++j)
-      shared_array[threadIdx.x + j*chunk_size] = 0;
-
-    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i 
+= gridDim.x * blockDim.x)
-    {
-      value_vk = krylov_basis[i + k * internal_size];
-
-      for (unsigned int j=0; j<vecs_in_iteration; ++j)
-        shared_array[threadIdx.x + j*chunk_size] += value_vk * krylov_basis[i 
+ (k_base + j) * internal_size];
-    }
-
-    // parallel reduction in work group
-    for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
-    {
-      __syncthreads();
-      if (threadIdx.x < stride) {
-        for (unsigned int j=0; j<vecs_in_iteration; ++j)
-          shared_array[threadIdx.x + j*chunk_size] += shared_array[threadIdx.x 
+ j*chunk_size + stride];
-      }
-    }
-
-    // write results to result array
-    if (threadIdx.x == 0)
-      for (unsigned int j=0; j<vecs_in_iteration; ++j)
-        vi_in_vk_buffer[blockIdx.x + (k_base + j) * chunk_size] = 
shared_array[j*chunk_size];
-
-    k_base += vecs_in_iteration;
-  }
-
-}
-
-template <typename T>
-void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & 
device_krylov_basis,
-                                         vcl_size_t v_k_size,
-                                         vcl_size_t v_k_internal_size,
-                                         vcl_size_t param_k,
-                                         vector_base<T> & vi_in_vk_buffer,
-                                         vcl_size_t buffer_chunk_size)
-{
-  unsigned int chunk_size = buffer_chunk_size;
-  unsigned int size = v_k_size;
-  unsigned int internal_size = v_k_internal_size;
-  unsigned int k = param_k;
-
-  pipelined_gmres_gram_schmidt_stage1_kernel<<<128, 
128>>>(viennacl::cuda_arg(device_krylov_basis),
-                                                           size,
-                                                           internal_size,
-                                                           k,
-                                                           
viennacl::cuda_arg(vi_in_vk_buffer),
-                                                           chunk_size);
-  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_gram_schmidt_stage1_kernel");
-}
-
-
-
-
-template <typename T>
-__global__ void pipelined_gmres_gram_schmidt_stage2_kernel(T * krylov_basis,
-                                                           unsigned int size,
-                                                           unsigned int 
internal_size,
-                                                           unsigned int k,
-                                                           T const * 
vi_in_vk_buffer,
-                                                           unsigned int 
chunk_size,
-                                                           T * R_buffer,
-                                                           unsigned int 
krylov_dim,
-                                                           T * 
inner_prod_buffer)
-{
-  __shared__ T shared_array[7*128];
-  T vk_dot_vk = 0;
-  T value_vk = 0;
-
-  unsigned int k_base = 0;
-  while (k_base < k)
-  {
-    unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);
-
-    // parallel reduction in work group for <v_i, v_k>
-    for (unsigned int j=0; j<vecs_in_iteration; ++j)
-      shared_array[threadIdx.x + j*chunk_size] = vi_in_vk_buffer[threadIdx.x + 
(k_base + j) * chunk_size];
-    for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
-    {
-      __syncthreads();
-      if (threadIdx.x < stride) {
-        for (unsigned int j=0; j<vecs_in_iteration; ++j)
-          shared_array[threadIdx.x + j*chunk_size] += shared_array[threadIdx.x 
+ j*chunk_size + stride];
-      }
-    }
-    __syncthreads();
-
-    // v_k -= <v_i, v_k> v_i:
-    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i 
+= gridDim.x * blockDim.x)
-    {
-      value_vk = krylov_basis[i + k * internal_size];
-
-      for (unsigned int j=0; j<vecs_in_iteration; ++j)
-        value_vk -= shared_array[j*chunk_size] * krylov_basis[i + (k_base + j) 
* internal_size];
-      vk_dot_vk += (k_base + vecs_in_iteration == k) ? (value_vk * value_vk) : 
0;
-      krylov_basis[i + k * internal_size] = value_vk;
-    }
-
-    // write to R: (to avoid thread divergence, all threads write the same 
value)
-    if (blockIdx.x == 0)
-      for (unsigned int j=0; j<vecs_in_iteration; ++j)
-        R_buffer[(k_base + j) + k*krylov_dim] = shared_array[j*chunk_size];
-    __syncthreads();
-
-    k_base += vecs_in_iteration;
-  }
-
-  // parallel reduction in work group for <v_k, v_k>
-  shared_array[threadIdx.x] = vk_dot_vk;
-  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
-  {
-    __syncthreads();
-    if (threadIdx.x < stride)
-      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
-  }
-
-  // write results to result array
-  if (threadIdx.x == 0)
-    inner_prod_buffer[chunk_size+blockIdx.x] = shared_array[0];
-}
-
-template <typename T>
-void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis,
-                                         vcl_size_t v_k_size,
-                                         vcl_size_t v_k_internal_size,
-                                         vcl_size_t param_k,
-                                         vector_base<T> const & 
vi_in_vk_buffer,
-                                         vector_base<T> & R_buffer,
-                                         vcl_size_t krylov_dim,
-                                         vector_base<T> & inner_prod_buffer,
-                                         vcl_size_t buffer_chunk_size)
-{
-  unsigned int chunk_size = buffer_chunk_size;
-  unsigned int size = v_k_size;
-  unsigned int internal_size = v_k_internal_size;
-  unsigned int k = param_k;
-  unsigned int krylov = krylov_dim;
-
-  pipelined_gmres_gram_schmidt_stage2_kernel<<<128, 
128>>>(viennacl::cuda_arg(device_krylov_basis),
-                                                           size,
-                                                           internal_size,
-                                                           k,
-                                                           
viennacl::cuda_arg(vi_in_vk_buffer),
-                                                           chunk_size,
-                                                           
viennacl::cuda_arg(R_buffer),
-                                                           krylov,
-                                                           
viennacl::cuda_arg(inner_prod_buffer));
-  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_gram_schmidt_stage2_kernel");
-}
-
-
-
-
-template <typename T>
-__global__ void pipelined_gmres_update_result_kernel(T * result,
-                                                     T const * residual,
-                                                     T const * krylov_basis,
-                                                     unsigned int size,
-                                                     unsigned int 
internal_size,
-                                                     T const * coefficients,
-                                                     unsigned int k)
-{
-  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += 
gridDim.x * blockDim.x)
-  {
-    T value_result = result[i] + coefficients[0] * residual[i];
-
-    for (unsigned int j = 1; j < k; ++j)
-      value_result += coefficients[j] * krylov_basis[i + (j-1)*internal_size];
-
-    result[i] = value_result;
-  }
-}
-
-template <typename T>
-void pipelined_gmres_update_result(vector_base<T> & result,
-                                   vector_base<T> const & residual,
-                                   vector_base<T> const & krylov_basis,
-                                   vcl_size_t v_k_size,
-                                   vcl_size_t v_k_internal_size,
-                                   vector_base<T> const & coefficients,
-                                   vcl_size_t param_k)
-{
-  unsigned int size = v_k_size;
-  unsigned int internal_size = v_k_internal_size;
-  unsigned int k = param_k;
-
-  pipelined_gmres_update_result_kernel<<<128, 
128>>>(viennacl::cuda_arg(result),
-                                                     
viennacl::cuda_arg(residual),
-                                                     
viennacl::cuda_arg(krylov_basis),
-                                                     size,
-                                                     internal_size,
-                                                     
viennacl::cuda_arg(coefficients),
-                                                     k);
-  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_update_result_kernel");
-}
-
-
-
-template <typename NumericT>
-void pipelined_gmres_prod(compressed_matrix<NumericT> const & A,
-                          vector_base<NumericT> const & p,
-                          vector_base<NumericT> & Ap,
-                          vector_base<NumericT> & inner_prod_buffer)
-{
-  unsigned int size = p.size();
-  unsigned int buffer_size_per_vector = static_cast<unsigned 
int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500
-  if (double(A.nnz()) / double(A.size1()) > 6.4) // less than 10% of threads 
expected to idle
-  {
-    pipelined_cg_csr_vec_mul_blocked_kernel<8,  NumericT><<<256, 256>>>(   // 
experience on a GTX 750 Ti suggests that 8 is a substantially better choice here
-#else
-  if (double(A.nnz()) / double(A.size1()) > 12.0) // less than 25% of threads 
expected to idle
-  {
-    pipelined_cg_csr_vec_mul_blocked_kernel<16, NumericT><<<128, 256>>>(   // 
Fermi and Kepler prefer 16 threads per row (half-warp)
-#endif
-                                                                        
viennacl::cuda_arg<unsigned int>(A.handle1()),
-                                                                        
viennacl::cuda_arg<unsigned int>(A.handle2()),
-                                                                        
viennacl::cuda_arg<NumericT>(A.handle()),
-                                                                        
viennacl::cuda_arg(p) + viennacl::traits::start(p),
-                                                                        
viennacl::cuda_arg(Ap) + viennacl::traits::start(Ap),
-                                                                        size,
-                                                                        
viennacl::cuda_arg(inner_prod_buffer),
-                                                                        
buffer_size_per_vector
-                                                                       );
-    VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_blocked_kernel");
-  }
-  else
-  {
-    pipelined_cg_csr_vec_mul_adaptive_kernel<<<128, 
256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
-                                                           
viennacl::cuda_arg<unsigned int>(A.handle2()),
-                                                           
viennacl::cuda_arg<unsigned int>(A.handle3()),
-                                                           
viennacl::cuda_arg<NumericT>(A.handle()),
-                                                           
static_cast<unsigned int>(A.blocks1()),
-                                                           
viennacl::cuda_arg(p) + viennacl::traits::start(p),
-                                                           
viennacl::cuda_arg(Ap) + viennacl::traits::start(Ap),
-                                                           size,
-                                                           
viennacl::cuda_arg(inner_prod_buffer),
-                                                           
buffer_size_per_vector);
-    VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_adaptive_kernel");
-  }
-
-}
-
-template <typename T>
-void pipelined_gmres_prod(coordinate_matrix<T> const & A,
-                          vector_base<T> const & p,
-                          vector_base<T> & Ap,
-                          vector_base<T> & inner_prod_buffer)
-{
-  unsigned int size = p.size();
-  unsigned int buffer_size_per_vector = static_cast<unsigned 
int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
-
-  Ap.clear();
-
-  pipelined_cg_coo_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned 
int>(A.handle12()),
-                                                
viennacl::cuda_arg<T>(A.handle()),
-                                                viennacl::cuda_arg<unsigned 
int>(A.handle3()),
-                                                viennacl::cuda_arg(p) + 
viennacl::traits::start(p),
-                                                viennacl::cuda_arg(Ap) + 
viennacl::traits::start(Ap),
-                                                size,
-                                                
viennacl::cuda_arg(inner_prod_buffer),
-                                                buffer_size_per_vector);
-  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_coo_vec_mul_kernel");
-}
-
-template <typename T>
-void pipelined_gmres_prod(ell_matrix<T> const & A,
-                          vector_base<T> const & p,
-                          vector_base<T> & Ap,
-                          vector_base<T> & inner_prod_buffer)
-{
-  unsigned int size = p.size();
-  unsigned int buffer_size_per_vector = static_cast<unsigned 
int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
-
-  pipelined_cg_ell_vec_mul_kernel<<<128, 256>>>(viennacl::cuda_arg<unsigned 
int>(A.handle2()),
-                                                
viennacl::cuda_arg<T>(A.handle()),
-                                                static_cast<unsigned 
int>(A.internal_size1()),
-                                                static_cast<unsigned 
int>(A.maxnnz()),
-                                                viennacl::cuda_arg(p) + 
viennacl::traits::start(p),
-                                                viennacl::cuda_arg(Ap) + 
viennacl::traits::start(Ap),
-                                                size,
-                                                
viennacl::cuda_arg(inner_prod_buffer),
-                                                buffer_size_per_vector);
-  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_ell_vec_mul_kernel");
-}
-
-template <typename T>
-void pipelined_gmres_prod(sliced_ell_matrix<T> const & A,
-                          vector_base<T> const & p,
-                          vector_base<T> & Ap,
-                          vector_base<T> & inner_prod_buffer)
-{
-  unsigned int size = p.size();
-  unsigned int b


<TRUNCATED>

[36/51] [partial] mahout git commit: Revert "(nojira) add native-viennaCL module to codebase. closes apache/mahout#241"

Reply via email to