http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/iterative_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/iterative_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/iterative_operations.hpp
new file mode 100644
index 0000000..4330431
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/iterative_operations.hpp
@@ -0,0 +1,2049 @@
#ifndef VIENNACL_LINALG_CUDA_ITERATIVE_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_ITERATIVE_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   [email protected]

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/cuda/iterative_operations.hpp
    @brief Implementations of operations using sparse matrices using CUDA
*/

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/cuda/common.hpp"

namespace viennacl
{
namespace linalg
{
namespace cuda
{

//
// CG vector update:
//

// cpu scalar
/** @brief Fused CG vector update: result += alpha*p, r -= alpha*Ap, p = r + beta*p,
*          plus the first (per-block) reduction stage of the inner product <r,r>.
*
* One partial sum of <r,r> per block is written to inner_prod_buffer[blockIdx.x].
* Assumes blockDim.x is a power of two and at most 256 (size of the shared reduction buffer).
*/
template<typename NumericT>
__global__ void pipelined_cg_vector_kernel(NumericT * result,
                                           NumericT alpha,
                                           NumericT * p,
                                           NumericT * r,
                                           NumericT const * Ap,
                                           NumericT beta,
                                           NumericT * inner_prod_buffer,
                                           unsigned int size)
{
  NumericT r_dot_r = 0;
  // grid-stride loop over all vector entries
  for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size; idx += gridDim.x * blockDim.x)
  {
    NumericT p_entry = p[idx];
    NumericT r_entry = r[idx];

    result[idx] += alpha * p_entry;
    r_entry     -= alpha * Ap[idx];
    p_entry      = r_entry + beta * p_entry;

    p[idx] = p_entry;
    r[idx] = r_entry;
    r_dot_r += r_entry * r_entry;
  }

  // tree reduction of the per-thread partial sums within the block
  __shared__ NumericT reduction_buf[256];
  reduction_buf[threadIdx.x] = r_dot_r;
  for (unsigned int half = blockDim.x / 2; half > 0; half /= 2)
  {
    __syncthreads();
    if (threadIdx.x < half)
      reduction_buf[threadIdx.x] += reduction_buf[threadIdx.x + half];
  }

  // one partial result of <r,r> per block
  if (threadIdx.x == 0)
    inner_prod_buffer[blockIdx.x] = reduction_buf[0];
}


/** @brief Host wrapper launching pipelined_cg_vector_kernel with 128 blocks of 128 threads. */
template<typename NumericT>
void pipelined_cg_vector_update(vector_base<NumericT> & result,
                                NumericT alpha,
                                vector_base<NumericT> & p,
                                vector_base<NumericT> & r,
                                vector_base<NumericT> const & Ap,
                                NumericT beta,
                                vector_base<NumericT> & inner_prod_buffer)
{
  unsigned int vec_size = result.size();
  pipelined_cg_vector_kernel<<<128, 128>>>(viennacl::cuda_arg(result),
                                           alpha,
                                           viennacl::cuda_arg(p),
                                           viennacl::cuda_arg(r),
                                           viennacl::cuda_arg(Ap),
                                           beta,
                                           viennacl::cuda_arg(inner_prod_buffer),
                                           vec_size);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_vector_kernel");
}




//
// Compressed matrix
//


template<unsigned int SubWarpSizeV, typename NumericT>
__global__ void pipelined_cg_csr_vec_mul_blocked_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          const NumericT * p,
          NumericT * Ap,
          unsigned int size,
          NumericT * inner_prod_buffer,
          unsigned int buffer_size)
{
  __shared__ NumericT shared_elements[256];
  NumericT inner_prod_ApAp = 0;
  NumericT inner_prod_pAp = 0;

  const unsigned int id_in_row = threadIdx.x % SubWarpSizeV;
  const unsigned int block_increment = blockDim.x * ((size - 1) / (gridDim.x * blockDim.x) + 1);
  const
unsigned int block_start = blockIdx.x * block_increment; + const unsigned int block_stop = min(block_start + block_increment, size); + + for (unsigned int row = block_start + threadIdx.x / SubWarpSizeV; + row < block_stop; + row += blockDim.x / SubWarpSizeV) + { + NumericT dot_prod = NumericT(0); + unsigned int row_end = row_indices[row+1]; + for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += SubWarpSizeV) + dot_prod += elements[i] * p[column_indices[i]]; + + shared_elements[threadIdx.x] = dot_prod; + if (1 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 1]; + if (2 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 2]; + if (4 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 4]; + if (8 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 8]; + if (16 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 16]; + + if (id_in_row == 0) + { + Ap[row] = shared_elements[threadIdx.x]; + inner_prod_ApAp += shared_elements[threadIdx.x] * shared_elements[threadIdx.x]; + inner_prod_pAp += p[row] * shared_elements[threadIdx.x]; + } + } + + ////////// parallel reduction in work group + __shared__ NumericT shared_array_ApAp[256]; + __shared__ NumericT shared_array_pAp[256]; + shared_array_ApAp[threadIdx.x] = inner_prod_ApAp; + shared_array_pAp[threadIdx.x] = inner_prod_pAp; + for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) + { + shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride]; + shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride]; + } + } + + // write results to result array + if (threadIdx.x == 0) { + inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0]; + inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0]; + } + +} + +template<typename NumericT> +__global__ void 
pipelined_cg_csr_vec_mul_adaptive_kernel( + const unsigned int * row_indices, + const unsigned int * column_indices, + const unsigned int * row_blocks, + const NumericT * elements, + unsigned int num_blocks, + const NumericT * p, + NumericT * Ap, + unsigned int size, + NumericT * inner_prod_buffer, + unsigned int buffer_size) +{ + NumericT inner_prod_ApAp = 0; + NumericT inner_prod_pAp = 0; + + __shared__ NumericT shared_elements[1024]; + + for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x) + { + unsigned int row_start = row_blocks[block_id]; + unsigned int row_stop = row_blocks[block_id + 1]; + unsigned int element_start = row_indices[row_start]; + unsigned int element_stop = row_indices[row_stop]; + unsigned int rows_to_process = row_stop - row_start; + + if (rows_to_process > 1) // CSR stream with one thread per row + { + // load to shared buffer: + for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x) + shared_elements[i - element_start] = elements[i] * p[column_indices[i]]; + + __syncthreads(); + + // use one thread per row to sum: + for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x) + { + NumericT dot_prod = 0; + unsigned int thread_row_start = row_indices[row] - element_start; + unsigned int thread_row_stop = row_indices[row + 1] - element_start; + for (unsigned int i = thread_row_start; i < thread_row_stop; ++i) + dot_prod += shared_elements[i]; + Ap[row] = dot_prod; + inner_prod_ApAp += dot_prod * dot_prod; + inner_prod_pAp += p[row] * dot_prod; + } + } + // TODO here: Consider CSR vector for two to four rows (cf. OpenCL implementation. 
Experience on Fermi suggests that this may not be necessary) + else // CSR vector for a single row + { + // load and sum to shared buffer: + shared_elements[threadIdx.x] = 0; + for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x) + shared_elements[threadIdx.x] += elements[i] * p[column_indices[i]]; + + // reduction to obtain final result + for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) + shared_elements[threadIdx.x] += shared_elements[threadIdx.x+stride]; + } + + if (threadIdx.x == 0) + { + Ap[row_start] = shared_elements[0]; + inner_prod_ApAp += shared_elements[0] * shared_elements[0]; + inner_prod_pAp += p[row_start] * shared_elements[0]; + } + } + + __syncthreads(); // avoid race conditions + } + + ////////// parallel reduction in work group + __shared__ NumericT shared_array_ApAp[256]; + __shared__ NumericT shared_array_pAp[256]; + shared_array_ApAp[threadIdx.x] = inner_prod_ApAp; + shared_array_pAp[threadIdx.x] = inner_prod_pAp; + for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) + { + shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride]; + shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride]; + } + } + + // write results to result array + if (threadIdx.x == 0) { + inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0]; + inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0]; + } +} + + + + +template<typename NumericT> +void pipelined_cg_prod(compressed_matrix<NumericT> const & A, + vector_base<NumericT> const & p, + vector_base<NumericT> & Ap, + vector_base<NumericT> & inner_prod_buffer) +{ + unsigned int size = p.size(); + unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3); + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500 + if (double(A.nnz()) / 
double(A.size1()) > 6.4) // less than 10% of threads expected to idle + { + pipelined_cg_csr_vec_mul_blocked_kernel<8, NumericT><<<256, 256>>>( // experience on a GTX 750 Ti suggests that 8 is a substantially better choice here +#else + if (double(A.nnz()) / double(A.size1()) > 12.0) // less than 25% of threads expected to idle + { + pipelined_cg_csr_vec_mul_blocked_kernel<16, NumericT><<<256, 256>>>( // Fermi and Kepler prefer 16 threads per row (half-warp) +#endif + viennacl::cuda_arg<unsigned int>(A.handle1()), + viennacl::cuda_arg<unsigned int>(A.handle2()), + viennacl::cuda_arg<NumericT>(A.handle()), + viennacl::cuda_arg(p), + viennacl::cuda_arg(Ap), + size, + viennacl::cuda_arg(inner_prod_buffer), + buffer_size_per_vector + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_blocked_kernel"); + } + else + { + pipelined_cg_csr_vec_mul_adaptive_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()), + viennacl::cuda_arg<unsigned int>(A.handle2()), + viennacl::cuda_arg<unsigned int>(A.handle3()), + viennacl::cuda_arg<NumericT>(A.handle()), + static_cast<unsigned int>(A.blocks1()), + viennacl::cuda_arg(p), + viennacl::cuda_arg(Ap), + size, + viennacl::cuda_arg(inner_prod_buffer), + buffer_size_per_vector); + VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_kernel"); + } +} + + +// +// Coordinate Matrix +// + + +template<typename NumericT> +__global__ void pipelined_cg_coo_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index) + const NumericT * elements, + const unsigned int * group_boundaries, + const NumericT * p, + NumericT * Ap, + unsigned int size, + NumericT * inner_prod_buffer, + unsigned int buffer_size) +{ + NumericT inner_prod_ApAp = 0; + NumericT inner_prod_pAp = 0; + __shared__ unsigned int shared_rows[128]; + __shared__ NumericT inter_results[128]; + + uint2 tmp; + NumericT val; + unsigned int group_start = group_boundaries[blockIdx.x]; + unsigned int group_end = group_boundaries[blockIdx.x + 1]; + 
unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x + + unsigned int local_index = 0; + + for (unsigned int k = 0; k < k_end; ++k) + { + local_index = group_start + k * blockDim.x + threadIdx.x; + + tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0); + val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0; + + //check for carry from previous loop run: + if (threadIdx.x == 0 && k > 0) + { + if (tmp.x == shared_rows[blockDim.x-1]) + val += inter_results[blockDim.x-1]; + else + { + NumericT Ap_entry = inter_results[blockDim.x-1]; + Ap[shared_rows[blockDim.x-1]] = Ap_entry; + inner_prod_ApAp += Ap_entry * Ap_entry; + inner_prod_pAp += Ap_entry * p[shared_rows[blockDim.x-1]]; + } + } + + //segmented parallel reduction begin + __syncthreads(); + shared_rows[threadIdx.x] = tmp.x; + inter_results[threadIdx.x] = val; + NumericT left = 0; + __syncthreads(); + + for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) + { + left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? 
inter_results[threadIdx.x - stride] : 0; + __syncthreads(); + inter_results[threadIdx.x] += left; + __syncthreads(); + } + //segmented parallel reduction end + + if (local_index < group_end && threadIdx.x < blockDim.x-1 && + shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1]) + { + NumericT Ap_entry = inter_results[threadIdx.x]; + Ap[tmp.x] = Ap_entry; + inner_prod_ApAp += Ap_entry * Ap_entry; + inner_prod_pAp += Ap_entry * p[tmp.x]; + } + + __syncthreads(); + } //for k + + if (local_index + 1 == group_end) + { + NumericT Ap_entry = inter_results[threadIdx.x]; + Ap[tmp.x] = Ap_entry; + inner_prod_ApAp += Ap_entry * Ap_entry; + inner_prod_pAp += Ap_entry * p[tmp.x]; + } + + ////////// parallel reduction in work group + __shared__ NumericT shared_array_ApAp[256]; + __shared__ NumericT shared_array_pAp[256]; + shared_array_ApAp[threadIdx.x] = inner_prod_ApAp; + shared_array_pAp[threadIdx.x] = inner_prod_pAp; + for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) + { + shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride]; + shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride]; + } + } + + // write results to result array + if (threadIdx.x == 0) { + inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0]; + inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0]; + } + +} + + +template<typename NumericT> +void pipelined_cg_prod(coordinate_matrix<NumericT> const & A, + vector_base<NumericT> const & p, + vector_base<NumericT> & Ap, + vector_base<NumericT> & inner_prod_buffer) +{ + unsigned int size = p.size(); + unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3); + + Ap.clear(); + + pipelined_cg_coo_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle12()), + viennacl::cuda_arg<NumericT>(A.handle()), + viennacl::cuda_arg<unsigned int>(A.handle3()), + 
viennacl::cuda_arg(p), + viennacl::cuda_arg(Ap), + size, + viennacl::cuda_arg(inner_prod_buffer), + buffer_size_per_vector); + VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_coo_vec_mul_kernel"); +} + + + +// +// ELL Matrix +// + +template<typename NumericT> +__global__ void pipelined_cg_ell_vec_mul_kernel(const unsigned int * coords, + const NumericT * elements, + unsigned int internal_row_num, + unsigned int items_per_row, + const NumericT * p, + NumericT * Ap, + unsigned int size, + NumericT * inner_prod_buffer, + unsigned int buffer_size) +{ + NumericT inner_prod_ApAp = 0; + NumericT inner_prod_pAp = 0; + unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int glb_sz = gridDim.x * blockDim.x; + + for (unsigned int row = glb_id; row < size; row += glb_sz) + { + NumericT sum = 0; + + unsigned int offset = row; + for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) + { + NumericT val = elements[offset]; + sum += val ? p[coords[offset]] * val : NumericT(0); + } + + Ap[row] = sum; + inner_prod_ApAp += sum * sum; + inner_prod_pAp += sum * p[row]; + } + + ////////// parallel reduction in work group + __shared__ NumericT shared_array_ApAp[256]; + __shared__ NumericT shared_array_pAp[256]; + shared_array_ApAp[threadIdx.x] = inner_prod_ApAp; + shared_array_pAp[threadIdx.x] = inner_prod_pAp; + for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) + { + shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride]; + shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride]; + } + } + + // write results to result array + if (threadIdx.x == 0) { + inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0]; + inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0]; + } +} + + +template<typename NumericT> +void pipelined_cg_prod(ell_matrix<NumericT> const & A, + vector_base<NumericT> const & p, + 
vector_base<NumericT> & Ap, + vector_base<NumericT> & inner_prod_buffer) +{ + unsigned int size = p.size(); + unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3); + + pipelined_cg_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()), + viennacl::cuda_arg<NumericT>(A.handle()), + static_cast<unsigned int>(A.internal_size1()), + static_cast<unsigned int>(A.maxnnz()), + viennacl::cuda_arg(p), + viennacl::cuda_arg(Ap), + size, + viennacl::cuda_arg(inner_prod_buffer), + buffer_size_per_vector); + VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_ell_vec_mul_kernel"); +} + + +// +// SELL-C-\sigma Matrix +// + +template<typename NumericT> +__global__ void pipelined_cg_sliced_ell_vec_mul_kernel(const unsigned int * columns_per_block, + const unsigned int * column_indices, + const unsigned int * block_start, + const NumericT * elements, + const NumericT * p, + NumericT * Ap, + unsigned int size, + unsigned int block_size, + NumericT * inner_prod_buffer, + unsigned int buffer_size) +{ + NumericT inner_prod_ApAp = 0; + NumericT inner_prod_pAp = 0; + + unsigned int blocks_per_threadblock = blockDim.x / block_size; + unsigned int id_in_block = threadIdx.x % block_size; + unsigned int num_blocks = (size - 1) / block_size + 1; + unsigned int global_warp_count = blocks_per_threadblock * gridDim.x; + unsigned int global_warp_id = blocks_per_threadblock * blockIdx.x + threadIdx.x / block_size; + + for (unsigned int block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count) + { + unsigned int row = block_idx * block_size + id_in_block; + unsigned int offset = block_start[block_idx]; + unsigned int num_columns = columns_per_block[block_idx]; + + NumericT sum = 0; + for (unsigned int item_id = 0; item_id < num_columns; item_id++) + { + unsigned int index = offset + item_id * block_size + id_in_block; + NumericT val = elements[index]; + + sum += val ? 
(p[column_indices[index]] * val) : 0; + } + + if (row < size) + { + Ap[row] = sum; + inner_prod_ApAp += sum * sum; + inner_prod_pAp += sum * p[row]; + } + } + + ////////// parallel reduction in work group + __shared__ NumericT shared_array_ApAp[256]; + __shared__ NumericT shared_array_pAp[256]; + shared_array_ApAp[threadIdx.x] = inner_prod_ApAp; + shared_array_pAp[threadIdx.x] = inner_prod_pAp; + for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) + { + shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride]; + shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride]; + } + } + + // write results to result array + if (threadIdx.x == 0) { + inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0]; + inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0]; + } +} + +template<typename NumericT> +void pipelined_cg_prod(sliced_ell_matrix<NumericT> const & A, + vector_base<NumericT> const & p, + vector_base<NumericT> & Ap, + vector_base<NumericT> & inner_prod_buffer) +{ + unsigned int size = p.size(); + unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3); + + pipelined_cg_sliced_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()), + viennacl::cuda_arg<unsigned int>(A.handle2()), + viennacl::cuda_arg<unsigned int>(A.handle3()), + viennacl::cuda_arg<NumericT>(A.handle()), + viennacl::cuda_arg(p), + viennacl::cuda_arg(Ap), + size, + static_cast<unsigned int>(A.rows_per_block()), + viennacl::cuda_arg(inner_prod_buffer), + buffer_size_per_vector); + VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_sliced_ell_vec_mul_kernel"); +} + + +// +// Hybrid Matrix +// + + +template<typename NumericT> +__global__ void pipelined_cg_hyb_vec_mul_kernel(const unsigned int * ell_coords, + const NumericT * ell_elements, + const unsigned int * csr_rows, + const unsigned int * csr_cols, 
+ const NumericT * csr_elements, + unsigned int internal_row_num, + unsigned int items_per_row, + const NumericT * p, + NumericT * Ap, + unsigned int size, + NumericT * inner_prod_buffer, + unsigned int buffer_size) +{ + NumericT inner_prod_ApAp = 0; + NumericT inner_prod_pAp = 0; + unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int glb_sz = gridDim.x * blockDim.x; + + for (unsigned int row = glb_id; row < size; row += glb_sz) + { + NumericT sum = 0; + + unsigned int offset = row; + for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) + { + NumericT val = ell_elements[offset]; + + sum += val ? p[ell_coords[offset]] * val : NumericT(0); + } + + unsigned int col_begin = csr_rows[row]; + unsigned int col_end = csr_rows[row + 1]; + + for (unsigned int item_id = col_begin; item_id < col_end; item_id++) + { + sum += p[csr_cols[item_id]] * csr_elements[item_id]; + } + + Ap[row] = sum; + inner_prod_ApAp += sum * sum; + inner_prod_pAp += sum * p[row]; + } + + ////////// parallel reduction in work group + __shared__ NumericT shared_array_ApAp[256]; + __shared__ NumericT shared_array_pAp[256]; + shared_array_ApAp[threadIdx.x] = inner_prod_ApAp; + shared_array_pAp[threadIdx.x] = inner_prod_pAp; + for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) + { + shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride]; + shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride]; + } + } + + // write results to result array + if (threadIdx.x == 0) { + inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0]; + inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0]; + } +} + + + +template<typename NumericT> +void pipelined_cg_prod(hyb_matrix<NumericT> const & A, + vector_base<NumericT> const & p, + vector_base<NumericT> & Ap, + vector_base<NumericT> & inner_prod_buffer) +{ + unsigned int size = p.size(); + 
unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3); + + pipelined_cg_hyb_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()), + viennacl::cuda_arg<NumericT>(A.handle()), + viennacl::cuda_arg<unsigned int>(A.handle3()), + viennacl::cuda_arg<unsigned int>(A.handle4()), + viennacl::cuda_arg<NumericT>(A.handle5()), + static_cast<unsigned int>(A.internal_size1()), + static_cast<unsigned int>(A.ell_nnz()), + viennacl::cuda_arg(p), + viennacl::cuda_arg(Ap), + size, + viennacl::cuda_arg(inner_prod_buffer), + buffer_size_per_vector); + VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_hyb_vec_mul_kernel"); +} + + + +///////////////////////////////////// + +template<typename NumericT> +__global__ void pipelined_bicgstab_update_s_kernel(NumericT * s, + NumericT const * residual, + NumericT const * Ap, + unsigned int size, + NumericT * inner_prod_buffer, + unsigned int chunk_size, + unsigned int chunk_offset) +{ + NumericT alpha = 0; + + // parallel reduction in work group to compute <r, r0> / <Ap, r0> + __shared__ NumericT shared_array[256]; + __shared__ NumericT shared_array_Ap_in_r0[256]; + + shared_array[threadIdx.x] = inner_prod_buffer[threadIdx.x]; + shared_array_Ap_in_r0[threadIdx.x] = inner_prod_buffer[threadIdx.x + 3 * chunk_size]; + for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) { + shared_array[threadIdx.x] += shared_array[threadIdx.x + stride]; + shared_array_Ap_in_r0[threadIdx.x] += shared_array_Ap_in_r0[threadIdx.x + stride]; + } + } + + // compute alpha from reduced values: + __syncthreads(); + alpha = shared_array[0] / shared_array_Ap_in_r0[0]; + + // run vector update and compute first stage of <s, s> + NumericT inner_prod_contrib = 0; + for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) + { + NumericT value_s = s[i]; + + value_s = residual[i] - alpha * 
Ap[i]; + inner_prod_contrib += value_s * value_s; + + s[i] = value_s; + } + __syncthreads(); + + // parallel reduction in work group + shared_array[threadIdx.x] = inner_prod_contrib; + for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) + shared_array[threadIdx.x] += shared_array[threadIdx.x + stride]; + } + + // write results to inner_prod_buffer + if (threadIdx.x == 0) + inner_prod_buffer[blockIdx.x + chunk_offset] = shared_array[0]; +} + +template<typename NumericT> +void pipelined_bicgstab_update_s(vector_base<NumericT> & s, + vector_base<NumericT> & r, + vector_base<NumericT> const & Ap, + vector_base<NumericT> & inner_prod_buffer, + vcl_size_t buffer_chunk_size, + vcl_size_t buffer_chunk_offset) +{ + unsigned int size = static_cast<unsigned int>(s.size()); + unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size); + unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset); + + pipelined_bicgstab_update_s_kernel<<<256, 256>>>(viennacl::cuda_arg(s), + viennacl::cuda_arg(r), + viennacl::cuda_arg(Ap), + size, + viennacl::cuda_arg(inner_prod_buffer), + chunk_size, + chunk_offset); + VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_update_s_kernel"); +} + +template<typename NumericT> +__global__ void pipelined_bicgstab_vector_kernel(NumericT * result, + NumericT alpha, + NumericT * p, + NumericT omega, + NumericT const * s, + NumericT * residual, + NumericT const * As, + NumericT beta, + NumericT const * Ap, + NumericT const * r0star, + NumericT * inner_prod_buffer, + unsigned int size) +{ + NumericT inner_prod_r_r0star = 0; + for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) + { + NumericT value_result = result[i]; + NumericT value_p = p[i]; + NumericT value_s = s[i]; + NumericT value_residual = residual[i]; + NumericT value_As = As[i]; + NumericT value_Ap = Ap[i]; + NumericT value_r0star = r0star[i]; + + value_result 
+= alpha * value_p + omega * value_s; + value_residual = value_s - omega * value_As; + value_p = value_residual + beta * (value_p - omega * value_Ap); + + result[i] = value_result; + residual[i] = value_residual; + p[i] = value_p; + inner_prod_r_r0star += value_residual * value_r0star; + } + + // parallel reduction in work group + __shared__ NumericT shared_array[256]; + shared_array[threadIdx.x] = inner_prod_r_r0star; + for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) + shared_array[threadIdx.x] += shared_array[threadIdx.x + stride]; + } + + // write results to result array + if (threadIdx.x == 0) + inner_prod_buffer[blockIdx.x] = shared_array[0]; +} + + +template<typename NumericT> +void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s, + vector_base<NumericT> & residual, vector_base<NumericT> const & As, + NumericT beta, vector_base<NumericT> const & Ap, + vector_base<NumericT> const & r0star, + vector_base<NumericT> & inner_prod_buffer, vcl_size_t buffer_chunk_size) +{ + (void)buffer_chunk_size; + unsigned int size = static_cast<unsigned int>(result.size()); + + pipelined_bicgstab_vector_kernel<<<256, 256>>>(viennacl::cuda_arg(result), + alpha, + viennacl::cuda_arg(p), + omega, + viennacl::cuda_arg(s), + viennacl::cuda_arg(residual), + viennacl::cuda_arg(As), + beta, + viennacl::cuda_arg(Ap), + viennacl::cuda_arg(r0star), + viennacl::cuda_arg(inner_prod_buffer), + size); + VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_vector_kernel"); +} + + + +// +// Compressed matrix +// + + +template<unsigned int SubWarpSizeV, typename NumericT> +__global__ void pipelined_bicgstab_csr_vec_mul_blocked_kernel( + const unsigned int * row_indices, + const unsigned int * column_indices, + const NumericT * elements, + const NumericT * p, + NumericT * Ap, + const NumericT * r0star, + unsigned int size, + 
NumericT * inner_prod_buffer, + unsigned int buffer_size, + unsigned int buffer_offset) +{ + __shared__ NumericT shared_elements[256]; + NumericT inner_prod_ApAp = 0; + NumericT inner_prod_pAp = 0; + NumericT inner_prod_r0Ap = 0; + + const unsigned int id_in_row = threadIdx.x % SubWarpSizeV; + const unsigned int block_increment = blockDim.x * ((size - 1) / (gridDim.x * blockDim.x) + 1); + const unsigned int block_start = blockIdx.x * block_increment; + const unsigned int block_stop = min(block_start + block_increment, size); + + for (unsigned int row = block_start + threadIdx.x / SubWarpSizeV; + row < block_stop; + row += blockDim.x / SubWarpSizeV) + { + NumericT dot_prod = NumericT(0); + unsigned int row_end = row_indices[row+1]; + for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += SubWarpSizeV) + dot_prod += elements[i] * p[column_indices[i]]; + + shared_elements[threadIdx.x] = dot_prod; + if (1 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 1]; + if (2 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 2]; + if (4 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 4]; + if (8 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 8]; + if (16 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 16]; + + if (id_in_row == 0) + { + Ap[row] = shared_elements[threadIdx.x]; + inner_prod_ApAp += shared_elements[threadIdx.x] * shared_elements[threadIdx.x]; + inner_prod_pAp += p[row] * shared_elements[threadIdx.x]; + inner_prod_r0Ap += r0star[row] * shared_elements[threadIdx.x]; + } + } + + ////////// parallel reduction in work group + __shared__ NumericT shared_array_ApAp[256]; + __shared__ NumericT shared_array_pAp[256]; + __shared__ NumericT shared_array_r0Ap[256]; + shared_array_ApAp[threadIdx.x] = inner_prod_ApAp; + shared_array_pAp[threadIdx.x] = inner_prod_pAp; + shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap; + 
for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) + { + shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride]; + shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride]; + shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride]; + } + } + + // write results to result array + if (threadIdx.x == 0) { + inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0]; + inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0]; + inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0]; + } + +} + + +template<typename NumericT> +__global__ void pipelined_bicgstab_csr_vec_mul_adaptive_kernel( + const unsigned int * row_indices, + const unsigned int * column_indices, + const unsigned int * row_blocks, + const NumericT * elements, + unsigned int num_blocks, + const NumericT * p, + NumericT * Ap, + const NumericT * r0star, + unsigned int size, + NumericT * inner_prod_buffer, + unsigned int buffer_size, + unsigned int buffer_offset) +{ + NumericT inner_prod_ApAp = 0; + NumericT inner_prod_pAp = 0; + NumericT inner_prod_r0Ap = 0; + + __shared__ NumericT shared_elements[1024]; + + for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x) + { + unsigned int row_start = row_blocks[block_id]; + unsigned int row_stop = row_blocks[block_id + 1]; + unsigned int element_start = row_indices[row_start]; + unsigned int element_stop = row_indices[row_stop]; + unsigned int rows_to_process = row_stop - row_start; + + if (rows_to_process > 1) // CSR stream with one thread per row + { + // load to shared buffer: + for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x) + shared_elements[i - element_start] = elements[i] * p[column_indices[i]]; + + __syncthreads(); + + // use one thread per row to sum: + for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x) + { + 
NumericT dot_prod = 0; + unsigned int thread_row_start = row_indices[row] - element_start; + unsigned int thread_row_stop = row_indices[row + 1] - element_start; + for (unsigned int i = thread_row_start; i < thread_row_stop; ++i) + dot_prod += shared_elements[i]; + Ap[row] = dot_prod; + inner_prod_ApAp += dot_prod * dot_prod; + inner_prod_pAp += p[row] * dot_prod; + inner_prod_r0Ap += r0star[row] * dot_prod; + } + } + // TODO here: Consider CSR vector for two to four rows (cf. OpenCL implementation. Experience on Fermi suggests that this may not be necessary) + else // CSR vector for a single row + { + // load and sum to shared buffer: + shared_elements[threadIdx.x] = 0; + for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x) + shared_elements[threadIdx.x] += elements[i] * p[column_indices[i]]; + + // reduction to obtain final result + for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) + shared_elements[threadIdx.x] += shared_elements[threadIdx.x+stride]; + } + + if (threadIdx.x == 0) + { + Ap[row_start] = shared_elements[0]; + inner_prod_ApAp += shared_elements[0] * shared_elements[0]; + inner_prod_pAp += p[row_start] * shared_elements[0]; + inner_prod_r0Ap += r0star[row_start] * shared_elements[0]; + } + } + + __syncthreads(); // avoid race conditions + } + + ////////// parallel reduction in work group + __shared__ NumericT shared_array_ApAp[256]; + __shared__ NumericT shared_array_pAp[256]; + __shared__ NumericT shared_array_r0Ap[256]; + shared_array_ApAp[threadIdx.x] = inner_prod_ApAp; + shared_array_pAp[threadIdx.x] = inner_prod_pAp; + shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap; + for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) + { + shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride]; + shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride]; + 
shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride]; + } + } + + // write results to result array + if (threadIdx.x == 0) { + inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0]; + inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0]; + inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0]; + } +} + + + + +template<typename NumericT> +void pipelined_bicgstab_prod(compressed_matrix<NumericT> const & A, + vector_base<NumericT> const & p, + vector_base<NumericT> & Ap, + vector_base<NumericT> const & r0star, + vector_base<NumericT> & inner_prod_buffer, + vcl_size_t buffer_chunk_size, + vcl_size_t buffer_chunk_offset) +{ + unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p)); + unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size); + unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset); + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500 + if (double(A.nnz()) / double(A.size1()) > 6.4) // less than 10% of threads expected to idle + { + pipelined_bicgstab_csr_vec_mul_blocked_kernel<8, NumericT><<<256, 256>>>( // experience on a GTX 750 Ti suggests that 8 is a substantially better choice here +#else + if (double(A.nnz()) / double(A.size1()) > 12.0) // less than 25% of threads expected to idle + { + pipelined_bicgstab_csr_vec_mul_blocked_kernel<16, NumericT><<<256, 256>>>( // Fermi and Kepler prefer 16 threads per row (half-warp) +#endif + viennacl::cuda_arg<unsigned int>(A.handle1()), + viennacl::cuda_arg<unsigned int>(A.handle2()), + viennacl::cuda_arg<NumericT>(A.handle()), + viennacl::cuda_arg(p), + viennacl::cuda_arg(Ap), + viennacl::cuda_arg(r0star), + vec_size, + viennacl::cuda_arg(inner_prod_buffer), + chunk_size, + chunk_offset + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_blocked_kernel"); + } + else + { + pipelined_bicgstab_csr_vec_mul_adaptive_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()), + 
viennacl::cuda_arg<unsigned int>(A.handle2()), + viennacl::cuda_arg<unsigned int>(A.handle3()), + viennacl::cuda_arg<NumericT>(A.handle()), + static_cast<unsigned int>(A.blocks1()), + viennacl::cuda_arg(p), + viennacl::cuda_arg(Ap), + viennacl::cuda_arg(r0star), + vec_size, + viennacl::cuda_arg(inner_prod_buffer), + chunk_size, + chunk_offset); + VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_csr_vec_mul_adaptive_kernel"); + } +} + + +// +// Coordinate Matrix +// + + +template<typename NumericT> +__global__ void pipelined_bicgstab_coo_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index) + const NumericT * elements, + const unsigned int * group_boundaries, + const NumericT * p, + NumericT * Ap, + const NumericT * r0star, + unsigned int size, + NumericT * inner_prod_buffer, + unsigned int buffer_size, + unsigned int buffer_offset) +{ + NumericT inner_prod_ApAp = 0; + NumericT inner_prod_pAp = 0; + NumericT inner_prod_r0Ap = 0; + __shared__ unsigned int shared_rows[128]; + __shared__ NumericT inter_results[128]; + + uint2 tmp; + NumericT val; + unsigned int group_start = group_boundaries[blockIdx.x]; + unsigned int group_end = group_boundaries[blockIdx.x + 1]; + unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x + + unsigned int local_index = 0; + + for (unsigned int k = 0; k < k_end; ++k) + { + local_index = group_start + k * blockDim.x + threadIdx.x; + + tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0); + val = (local_index < group_end) ? 
elements[local_index] * p[tmp.y] : 0; + + //check for carry from previous loop run: + if (threadIdx.x == 0 && k > 0) + { + if (tmp.x == shared_rows[blockDim.x-1]) + val += inter_results[blockDim.x-1]; + else + { + NumericT Ap_entry = inter_results[blockDim.x-1]; + Ap[shared_rows[blockDim.x-1]] = Ap_entry; + inner_prod_ApAp += Ap_entry * Ap_entry; + inner_prod_pAp += Ap_entry * p[shared_rows[blockDim.x-1]]; + inner_prod_r0Ap += r0star[shared_rows[blockDim.x-1]] * Ap_entry; + } + } + + //segmented parallel reduction begin + __syncthreads(); + shared_rows[threadIdx.x] = tmp.x; + inter_results[threadIdx.x] = val; + NumericT left = 0; + __syncthreads(); + + for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) + { + left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0; + __syncthreads(); + inter_results[threadIdx.x] += left; + __syncthreads(); + } + //segmented parallel reduction end + + if (local_index < group_end && threadIdx.x < blockDim.x-1 && + shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1]) + { + NumericT Ap_entry = inter_results[threadIdx.x]; + Ap[tmp.x] = Ap_entry; + inner_prod_ApAp += Ap_entry * Ap_entry; + inner_prod_pAp += Ap_entry * p[tmp.x]; + inner_prod_r0Ap += r0star[tmp.x] * Ap_entry; + } + + __syncthreads(); + } //for k + + if (local_index + 1 == group_end) + { + NumericT Ap_entry = inter_results[threadIdx.x]; + Ap[tmp.x] = Ap_entry; + inner_prod_ApAp += Ap_entry * Ap_entry; + inner_prod_pAp += Ap_entry * p[tmp.x]; + inner_prod_r0Ap += Ap_entry * r0star[tmp.x]; + } + + ////////// parallel reduction in work group + __shared__ NumericT shared_array_ApAp[256]; + __shared__ NumericT shared_array_pAp[256]; + __shared__ NumericT shared_array_r0Ap[256]; + shared_array_ApAp[threadIdx.x] = inner_prod_ApAp; + shared_array_pAp[threadIdx.x] = inner_prod_pAp; + shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap; + for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2) + { + 
__syncthreads(); + if (threadIdx.x < stride) + { + shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride]; + shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride]; + shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride]; + } + } + + // write results to result array + if (threadIdx.x == 0) { + inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0]; + inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0]; + inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0]; + } + +} + + +template<typename NumericT> +void pipelined_bicgstab_prod(coordinate_matrix<NumericT> const & A, + vector_base<NumericT> const & p, + vector_base<NumericT> & Ap, + vector_base<NumericT> const & r0star, + vector_base<NumericT> & inner_prod_buffer, + vcl_size_t buffer_chunk_size, + vcl_size_t buffer_chunk_offset) +{ + unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p)); + unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size); + unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset); + + Ap.clear(); + + pipelined_bicgstab_coo_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle12()), + viennacl::cuda_arg<NumericT>(A.handle()), + viennacl::cuda_arg<unsigned int>(A.handle3()), + viennacl::cuda_arg(p), + viennacl::cuda_arg(Ap), + viennacl::cuda_arg(r0star), + vec_size, + viennacl::cuda_arg(inner_prod_buffer), + chunk_size, + chunk_offset); + VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_coo_vec_mul_kernel"); +} + + + +// +// ELL Matrix +// + +template<typename NumericT> +__global__ void pipelined_bicgstab_ell_vec_mul_kernel(const unsigned int * coords, + const NumericT * elements, + unsigned int internal_row_num, + unsigned int items_per_row, + const NumericT * p, + NumericT * Ap, + const NumericT * r0star, + unsigned int size, + NumericT * inner_prod_buffer, + unsigned int buffer_size, + unsigned int buffer_offset) +{ + 
NumericT inner_prod_ApAp = 0; + NumericT inner_prod_pAp = 0; + NumericT inner_prod_r0Ap = 0; + unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int glb_sz = gridDim.x * blockDim.x; + + for (unsigned int row = glb_id; row < size; row += glb_sz) + { + NumericT sum = 0; + + unsigned int offset = row; + for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) + { + NumericT val = elements[offset]; + sum += val ? p[coords[offset]] * val : NumericT(0); + } + + Ap[row] = sum; + inner_prod_ApAp += sum * sum; + inner_prod_pAp += sum * p[row]; + inner_prod_r0Ap += sum * r0star[row]; + } + + ////////// parallel reduction in work group + __shared__ NumericT shared_array_ApAp[256]; + __shared__ NumericT shared_array_pAp[256]; + __shared__ NumericT shared_array_r0Ap[256]; + shared_array_ApAp[threadIdx.x] = inner_prod_ApAp; + shared_array_pAp[threadIdx.x] = inner_prod_pAp; + shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap; + for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) + { + shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride]; + shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride]; + shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride]; + } + } + + // write results to result array + if (threadIdx.x == 0) { + inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0]; + inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0]; + inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0]; + } +} + + +template<typename NumericT> +void pipelined_bicgstab_prod(ell_matrix<NumericT> const & A, + vector_base<NumericT> const & p, + vector_base<NumericT> & Ap, + vector_base<NumericT> const & r0star, + vector_base<NumericT> & inner_prod_buffer, + vcl_size_t buffer_chunk_size, + vcl_size_t buffer_chunk_offset) +{ + unsigned int vec_size = static_cast<unsigned 
int>(viennacl::traits::size(p)); + unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size); + unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset); + + pipelined_bicgstab_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()), + viennacl::cuda_arg<NumericT>(A.handle()), + static_cast<unsigned int>(A.internal_size1()), + static_cast<unsigned int>(A.maxnnz()), + viennacl::cuda_arg(p), + viennacl::cuda_arg(Ap), + viennacl::cuda_arg(r0star), + vec_size, + viennacl::cuda_arg(inner_prod_buffer), + chunk_size, + chunk_offset); + VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_ell_vec_mul_kernel"); +} + + +// +// SELL-C-\sigma Matrix +// + +template<typename NumericT> +__global__ void pipelined_bicgstab_sliced_ell_vec_mul_kernel(const unsigned int * columns_per_block, + const unsigned int * column_indices, + const unsigned int * block_start, + const NumericT * elements, + const NumericT * p, + NumericT * Ap, + const NumericT * r0star, + unsigned int size, + unsigned int block_size, + NumericT * inner_prod_buffer, + unsigned int buffer_size, + unsigned int buffer_offset) +{ + NumericT inner_prod_ApAp = 0; + NumericT inner_prod_pAp = 0; + NumericT inner_prod_r0Ap = 0; + + unsigned int blocks_per_threadblock = blockDim.x / block_size; + unsigned int id_in_block = threadIdx.x % block_size; + unsigned int num_blocks = (size - 1) / block_size + 1; + unsigned int global_warp_count = blocks_per_threadblock * gridDim.x; + unsigned int global_warp_id = blocks_per_threadblock * blockIdx.x + threadIdx.x / block_size; + + for (unsigned int block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count) + { + unsigned int row = block_idx * block_size + id_in_block; + unsigned int offset = block_start[block_idx]; + unsigned int num_columns = columns_per_block[block_idx]; + + NumericT sum = 0; + for (unsigned int item_id = 0; item_id < num_columns; item_id++) + { + unsigned int index = offset + item_id * 
block_size + id_in_block; + NumericT val = elements[index]; + + sum += val ? (p[column_indices[index]] * val) : 0; + } + + if (row < size) + { + Ap[row] = sum; + inner_prod_ApAp += sum * sum; + inner_prod_pAp += sum * p[row]; + inner_prod_r0Ap += sum * r0star[row]; + } + } + + ////////// parallel reduction in work group + __shared__ NumericT shared_array_ApAp[256]; + __shared__ NumericT shared_array_pAp[256]; + __shared__ NumericT shared_array_r0Ap[256]; + shared_array_ApAp[threadIdx.x] = inner_prod_ApAp; + shared_array_pAp[threadIdx.x] = inner_prod_pAp; + shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap; + for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) + { + shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride]; + shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride]; + shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride]; + } + } + + // write results to result array + if (threadIdx.x == 0) { + inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0]; + inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0]; + inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0]; + } +} + +template<typename NumericT> +void pipelined_bicgstab_prod(sliced_ell_matrix<NumericT> const & A, + vector_base<NumericT> const & p, + vector_base<NumericT> & Ap, + vector_base<NumericT> const & r0star, + vector_base<NumericT> & inner_prod_buffer, + vcl_size_t buffer_chunk_size, + vcl_size_t buffer_chunk_offset) +{ + unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p)); + unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size); + unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset); + + pipelined_bicgstab_sliced_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()), + viennacl::cuda_arg<unsigned int>(A.handle2()), + viennacl::cuda_arg<unsigned 
int>(A.handle3()), + viennacl::cuda_arg<NumericT>(A.handle()), + viennacl::cuda_arg(p), + viennacl::cuda_arg(Ap), + viennacl::cuda_arg(r0star), + vec_size, + static_cast<unsigned int>(A.rows_per_block()), + viennacl::cuda_arg(inner_prod_buffer), + chunk_size, + chunk_offset); + VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_sliced_ell_vec_mul_kernel"); +} + + +// +// Hybrid Matrix +// + + +template<typename NumericT> +__global__ void pipelined_bicgstab_hyb_vec_mul_kernel(const unsigned int * ell_coords, + const NumericT * ell_elements, + const unsigned int * csr_rows, + const unsigned int * csr_cols, + const NumericT * csr_elements, + unsigned int internal_row_num, + unsigned int items_per_row, + const NumericT * p, + NumericT * Ap, + const NumericT * r0star, + unsigned int size, + NumericT * inner_prod_buffer, + unsigned int buffer_size, + unsigned int buffer_offset) +{ + NumericT inner_prod_ApAp = 0; + NumericT inner_prod_pAp = 0; + NumericT inner_prod_r0Ap = 0; + unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int glb_sz = gridDim.x * blockDim.x; + + for (unsigned int row = glb_id; row < size; row += glb_sz) + { + NumericT sum = 0; + + unsigned int offset = row; + for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) + { + NumericT val = ell_elements[offset]; + + sum += val ? 
p[ell_coords[offset]] * val : NumericT(0); + } + + unsigned int col_begin = csr_rows[row]; + unsigned int col_end = csr_rows[row + 1]; + + for (unsigned int item_id = col_begin; item_id < col_end; item_id++) + { + sum += p[csr_cols[item_id]] * csr_elements[item_id]; + } + + Ap[row] = sum; + inner_prod_ApAp += sum * sum; + inner_prod_pAp += sum * p[row]; + inner_prod_r0Ap += sum * r0star[row]; + } + + ////////// parallel reduction in work group + __shared__ NumericT shared_array_ApAp[256]; + __shared__ NumericT shared_array_pAp[256]; + __shared__ NumericT shared_array_r0Ap[256]; + shared_array_ApAp[threadIdx.x] = inner_prod_ApAp; + shared_array_pAp[threadIdx.x] = inner_prod_pAp; + shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap; + for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) + { + shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride]; + shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride]; + shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride]; + } + } + + // write results to result array + if (threadIdx.x == 0) { + inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0]; + inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0]; + inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0]; + } +} + + + +template<typename NumericT> +void pipelined_bicgstab_prod(hyb_matrix<NumericT> const & A, + vector_base<NumericT> const & p, + vector_base<NumericT> & Ap, + vector_base<NumericT> const & r0star, + vector_base<NumericT> & inner_prod_buffer, + vcl_size_t buffer_chunk_size, + vcl_size_t buffer_chunk_offset) +{ + unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p)); + unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size); + unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset); + + pipelined_bicgstab_hyb_vec_mul_kernel<<<256, 
256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()), + viennacl::cuda_arg<NumericT>(A.handle()), + viennacl::cuda_arg<unsigned int>(A.handle3()), + viennacl::cuda_arg<unsigned int>(A.handle4()), + viennacl::cuda_arg<NumericT>(A.handle5()), + static_cast<unsigned int>(A.internal_size1()), + static_cast<unsigned int>(A.ell_nnz()), + viennacl::cuda_arg(p), + viennacl::cuda_arg(Ap), + viennacl::cuda_arg(r0star), + vec_size, + viennacl::cuda_arg(inner_prod_buffer), + chunk_size, + chunk_offset); + VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_hyb_vec_mul_kernel"); +} + +////////////////////////////////////////// + +template <typename T> +__global__ void pipelined_gmres_normalize_vk_kernel(T * vk, + unsigned int vk_offset, + T const * residual, + T * R_buffer, + unsigned int R_offset, + T const * inner_prod_buffer, + unsigned int chunk_size, + T * r_dot_vk_buffer, + unsigned int chunk_offset, + unsigned int size) +{ + __shared__ T shared_array[128]; + T norm_vk = 0; + + // parallel reduction in work group to compute <vk, vk> + shared_array[threadIdx.x] = inner_prod_buffer[threadIdx.x + chunk_size]; + for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) + shared_array[threadIdx.x] += shared_array[threadIdx.x + stride]; + } + + // compute alpha from reduced values: + __syncthreads(); + norm_vk = sqrt(shared_array[0]); + + T inner_prod_contrib = 0; + for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) { + T value_vk = vk[i + vk_offset] / norm_vk; + + inner_prod_contrib += residual[i] * value_vk; + + vk[i + vk_offset] = value_vk; + } + __syncthreads(); + + // parallel reduction in work group + shared_array[threadIdx.x] = inner_prod_contrib; + for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) + shared_array[threadIdx.x] += shared_array[threadIdx.x + stride]; + } + + // write results of first 
reduction stage: + if (threadIdx.x == 0) + r_dot_vk_buffer[blockIdx.x + chunk_offset] = shared_array[0]; + // store norm: + if (blockDim.x * blockIdx.x + threadIdx.x == 0) + R_buffer[R_offset] = norm_vk; +} + +/** @brief Performs a vector normalization needed for an efficient pipelined GMRES algorithm. + * + * This routines computes for vectors 'r', 'v_k': + * Second reduction step for ||v_k|| + * v_k /= ||v_k|| + * First reduction step for <r, v_k> + */ +template <typename T> +void pipelined_gmres_normalize_vk(vector_base<T> & v_k, + vector_base<T> const & residual, + vector_base<T> & R_buffer, + vcl_size_t offset_in_R, + vector_base<T> const & inner_prod_buffer, + vector_base<T> & r_dot_vk_buffer, + vcl_size_t buffer_chunk_size, + vcl_size_t buffer_chunk_offset) +{ + unsigned int vk_offset = viennacl::traits::start(v_k); + unsigned int R_offset = offset_in_R; + unsigned int chunk_size = buffer_chunk_size; + unsigned int chunk_offset = buffer_chunk_offset; + unsigned int size = v_k.size(); + + pipelined_gmres_normalize_vk_kernel<<<128, 128>>>(viennacl::cuda_arg(v_k), + vk_offset, + viennacl::cuda_arg(residual), + viennacl::cuda_arg(R_buffer), + R_offset, + viennacl::cuda_arg(inner_prod_buffer), + chunk_size, + viennacl::cuda_arg(r_dot_vk_buffer), + chunk_offset, + size); + VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_normalize_vk_kernel"); +} + + + +template <typename T> +__global__ void pipelined_gmres_gram_schmidt_stage1_kernel(T const * krylov_basis, + unsigned int size, + unsigned int internal_size, + unsigned int k, + T * vi_in_vk_buffer, + unsigned int chunk_size) +{ + __shared__ T shared_array[7*128]; + T value_vk = 0; + + unsigned int k_base = 0; + while (k_base < k) + { + unsigned int vecs_in_iteration = (k - k_base > 7) ? 
7 : (k - k_base); + + for (unsigned int j=0; j<vecs_in_iteration; ++j) + shared_array[threadIdx.x + j*chunk_size] = 0; + + for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) + { + value_vk = krylov_basis[i + k * internal_size]; + + for (unsigned int j=0; j<vecs_in_iteration; ++j) + shared_array[threadIdx.x + j*chunk_size] += value_vk * krylov_basis[i + (k_base + j) * internal_size]; + } + + // parallel reduction in work group + for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) { + for (unsigned int j=0; j<vecs_in_iteration; ++j) + shared_array[threadIdx.x + j*chunk_size] += shared_array[threadIdx.x + j*chunk_size + stride]; + } + } + + // write results to result array + if (threadIdx.x == 0) + for (unsigned int j=0; j<vecs_in_iteration; ++j) + vi_in_vk_buffer[blockIdx.x + (k_base + j) * chunk_size] = shared_array[j*chunk_size]; + + k_base += vecs_in_iteration; + } + +} + +template <typename T> +void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis, + vcl_size_t v_k_size, + vcl_size_t v_k_internal_size, + vcl_size_t param_k, + vector_base<T> & vi_in_vk_buffer, + vcl_size_t buffer_chunk_size) +{ + unsigned int chunk_size = buffer_chunk_size; + unsigned int size = v_k_size; + unsigned int internal_size = v_k_internal_size; + unsigned int k = param_k; + + pipelined_gmres_gram_schmidt_stage1_kernel<<<128, 128>>>(viennacl::cuda_arg(device_krylov_basis), + size, + internal_size, + k, + viennacl::cuda_arg(vi_in_vk_buffer), + chunk_size); + VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_gram_schmidt_stage1_kernel"); +} + + + + +template <typename T> +__global__ void pipelined_gmres_gram_schmidt_stage2_kernel(T * krylov_basis, + unsigned int size, + unsigned int internal_size, + unsigned int k, + T const * vi_in_vk_buffer, + unsigned int chunk_size, + T * R_buffer, + unsigned int krylov_dim, + T * inner_prod_buffer) +{ + __shared__ 
T shared_array[7*128]; + T vk_dot_vk = 0; + T value_vk = 0; + + unsigned int k_base = 0; + while (k_base < k) + { + unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base); + + // parallel reduction in work group for <v_i, v_k> + for (unsigned int j=0; j<vecs_in_iteration; ++j) + shared_array[threadIdx.x + j*chunk_size] = vi_in_vk_buffer[threadIdx.x + (k_base + j) * chunk_size]; + for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) { + for (unsigned int j=0; j<vecs_in_iteration; ++j) + shared_array[threadIdx.x + j*chunk_size] += shared_array[threadIdx.x + j*chunk_size + stride]; + } + } + __syncthreads(); + + // v_k -= <v_i, v_k> v_i: + for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) + { + value_vk = krylov_basis[i + k * internal_size]; + + for (unsigned int j=0; j<vecs_in_iteration; ++j) + value_vk -= shared_array[j*chunk_size] * krylov_basis[i + (k_base + j) * internal_size]; + vk_dot_vk += (k_base + vecs_in_iteration == k) ? 
(value_vk * value_vk) : 0; + krylov_basis[i + k * internal_size] = value_vk; + } + + // write to R: (to avoid thread divergence, all threads write the same value) + if (blockIdx.x == 0) + for (unsigned int j=0; j<vecs_in_iteration; ++j) + R_buffer[(k_base + j) + k*krylov_dim] = shared_array[j*chunk_size]; + __syncthreads(); + + k_base += vecs_in_iteration; + } + + // parallel reduction in work group for <v_k, v_k> + shared_array[threadIdx.x] = vk_dot_vk; + for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) + shared_array[threadIdx.x] += shared_array[threadIdx.x + stride]; + } + + // write results to result array + if (threadIdx.x == 0) + inner_prod_buffer[chunk_size+blockIdx.x] = shared_array[0]; +} + +template <typename T> +void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis, + vcl_size_t v_k_size, + vcl_size_t v_k_internal_size, + vcl_size_t param_k, + vector_base<T> const & vi_in_vk_buffer, + vector_base<T> & R_buffer, + vcl_size_t krylov_dim, + vector_base<T> & inner_prod_buffer, + vcl_size_t buffer_chunk_size) +{ + unsigned int chunk_size = buffer_chunk_size; + unsigned int size = v_k_size; + unsigned int internal_size = v_k_internal_size; + unsigned int k = param_k; + unsigned int krylov = krylov_dim; + + pipelined_gmres_gram_schmidt_stage2_kernel<<<128, 128>>>(viennacl::cuda_arg(device_krylov_basis), + size, + internal_size, + k, + viennacl::cuda_arg(vi_in_vk_buffer), + chunk_size, + viennacl::cuda_arg(R_buffer), + krylov, + viennacl::cuda_arg(inner_prod_buffer)); + VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_gram_schmidt_stage2_kernel"); +} + + + + +template <typename T> +__global__ void pipelined_gmres_update_result_kernel(T * result, + T const * residual, + T const * krylov_basis, + unsigned int size, + unsigned int internal_size, + T const * coefficients, + unsigned int k) +{ + for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x 
* blockDim.x) + { + T value_result = result[i] + coefficients[0] * residual[i]; + + for (unsigned int j = 1; j < k; ++j) + value_result += coefficients[j] * krylov_basis[i + (j-1)*internal_size]; + + result[i] = value_result; + } +} + +template <typename T> +void pipelined_gmres_update_result(vector_base<T> & result, + vector_base<T> const & residual, + vector_base<T> const & krylov_basis, + vcl_size_t v_k_size, + vcl_size_t v_k_internal_size, + vector_base<T> const & coefficients, + vcl_size_t param_k) +{ + unsigned int size = v_k_size; + unsigned int internal_size = v_k_internal_size; + unsigned int k = param_k; + + pipelined_gmres_update_result_kernel<<<128, 128>>>(viennacl::cuda_arg(result), + viennacl::cuda_arg(residual), + viennacl::cuda_arg(krylov_basis), + size, + internal_size, + viennacl::cuda_arg(coefficients), + k); + VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_update_result_kernel"); +} + + + +template <typename NumericT> +void pipelined_gmres_prod(compressed_matrix<NumericT> const & A, + vector_base<NumericT> const & p, + vector_base<NumericT> & Ap, + vector_base<NumericT> & inner_prod_buffer) +{ + unsigned int size = p.size(); + unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3); + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500 + if (double(A.nnz()) / double(A.size1()) > 6.4) // less than 10% of threads expected to idle + { + pipelined_cg_csr_vec_mul_blocked_kernel<8, NumericT><<<256, 256>>>( // experience on a GTX 750 Ti suggests that 8 is a substantially better choice here +#else + if (double(A.nnz()) / double(A.size1()) > 12.0) // less than 25% of threads expected to idle + { + pipelined_cg_csr_vec_mul_blocked_kernel<16, NumericT><<<128, 256>>>( // Fermi and Kepler prefer 16 threads per row (half-warp) +#endif + viennacl::cuda_arg<unsigned int>(A.handle1()), + viennacl::cuda_arg<unsigned int>(A.handle2()), + viennacl::cuda_arg<NumericT>(A.handle()), + 
viennacl::cuda_arg(p) + viennacl::traits::start(p), + viennacl::cuda_arg(Ap) + viennacl::traits::start(Ap), + size, + viennacl::cuda_arg(inner_prod_buffer), + buffer_size_per_vector + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_blocked_kernel"); + } + else + { + pipelined_cg_csr_vec_mul_adaptive_kernel<<<128, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()), + viennacl::cuda_arg<unsigned int>(A.handle2()), + viennacl::cuda_arg<unsigned int>(A.handle3()), + viennacl::cuda_arg<NumericT>(A.handle()), + static_cast<unsigned int>(A.blocks1()), + viennacl::cuda_arg(p) + viennacl::traits::start(p), + viennacl::cuda_arg(Ap) + viennacl::traits::start(Ap), + size, + viennacl::cuda_arg(inner_prod_buffer), + buffer_size_per_vector); + VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_adaptive_kernel"); + } + +} + +template <typename T> +void pipelined_gmres_prod(coordinate_matrix<T> const & A, + vector_base<T> const & p, + vector_base<T> & Ap, + vector_base<T> & inner_prod_buffer) +{ + unsigned int size = p.size(); + unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3); + + Ap.clear(); + + pipelined_cg_coo_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle12()), + viennacl::cuda_arg<T>(A.handle()), + viennacl::cuda_arg<unsigned int>(A.handle3()), + viennacl::cuda_arg(p) + viennacl::traits::start(p), + viennacl::cuda_arg(Ap) + viennacl::traits::start(Ap), + size, + viennacl::cuda_arg(inner_prod_buffer), + buffer_size_per_vector); + VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_coo_vec_mul_kernel"); +} + +template <typename T> +void pipelined_gmres_prod(ell_matrix<T> const & A, + vector_base<T> const & p, + vector_base<T> & Ap, + vector_base<T> & inner_prod_buffer) +{ + unsigned int size = p.size(); + unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3); + + pipelined_cg_ell_vec_mul_kernel<<<128, 
256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()), + viennacl::cuda_arg<T>(A.handle()), + static_cast<unsigned int>(A.internal_size1()), + static_cast<unsigned int>(A.maxnnz()), + viennacl::cuda_arg(p) + viennacl::traits::start(p), + viennacl::cuda_arg(Ap) + viennacl::traits::start(Ap), + size, + viennacl::cuda_arg(inner_prod_buffer), + buffer_size_per_vector); + VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_ell_vec_mul_kernel"); +} + +template <typename T> +void pipelined_gmres_prod(sliced_ell_matrix<T> const & A, + vector_base<T> const & p, + vector_base<T> & Ap, + vector_base<T> & inner_prod_buffer) +{ + unsigned int size = p.size(); + unsigned int buffe
<TRUNCATED>
