http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp new file mode 100644 index 0000000..24bcf96 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp @@ -0,0 +1,761 @@ +#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_SOLVE_HPP_ +#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_SOLVE_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp + @brief Implementations of direct triangular solvers for sparse matrices using CUDA +*/ + +#include "viennacl/forwards.h" + +namespace viennacl +{ +namespace linalg +{ +namespace cuda +{ +// +// Compressed matrix +// + +// +// non-transposed +// + +template<typename NumericT> +__global__ void csr_unit_lu_forward_kernel( + const unsigned int * row_indices, + const unsigned int * column_indices, + const NumericT * elements, + NumericT * vector, + unsigned int size) +{ + __shared__ unsigned int col_index_buffer[128]; + __shared__ NumericT element_buffer[128]; + __shared__ NumericT vector_buffer[128]; + + unsigned int nnz = row_indices[size]; + unsigned int current_row = 0; + unsigned int row_at_window_start = 0; + NumericT current_vector_entry = vector[0]; + unsigned int loop_end = (nnz / blockDim.x + 1) * blockDim.x; + unsigned int next_row = row_indices[1]; + + for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x) + { + //load into shared memory (coalesced access): + if (i < nnz) + { + element_buffer[threadIdx.x] = elements[i]; + unsigned int tmp = column_indices[i]; + col_index_buffer[threadIdx.x] = tmp; + vector_buffer[threadIdx.x] = vector[tmp]; + } + + __syncthreads(); + + //now a single thread does the remaining work in shared memory: + if (threadIdx.x == 0) + { + // traverse through all the loaded data: + for (unsigned int k=0; k<blockDim.x; ++k) + { + if (current_row < size && i+k == next_row) //current row is finished. Write back result + { + vector[current_row] = current_vector_entry; + ++current_row; + if (current_row < size) //load next row's data + { + next_row = row_indices[current_row+1]; + current_vector_entry = vector[current_row]; + } + } + + if (current_row < size && col_index_buffer[k] < current_row) //substitute + { + if (col_index_buffer[k] < row_at_window_start) //use recently computed results + current_vector_entry -= element_buffer[k] * vector_buffer[k]; + else if (col_index_buffer[k] < current_row) //use buffered data + current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; + } + + } // for k + + row_at_window_start = current_row; + } // if (get_local_id(0) == 0) + + __syncthreads(); + } //for i +} + + + +template<typename NumericT> +__global__ void csr_lu_forward_kernel( + const unsigned int * row_indices, + const unsigned int * column_indices, + const NumericT * elements, + NumericT * vector, + unsigned int size) +{ + __shared__ unsigned int col_index_buffer[128]; + __shared__ NumericT element_buffer[128]; + __shared__ NumericT vector_buffer[128]; + + unsigned int nnz = row_indices[size]; + unsigned int current_row = 0; + unsigned int row_at_window_start = 0; + NumericT current_vector_entry = vector[0]; + NumericT diagonal_entry = 0; + unsigned int loop_end = (nnz / blockDim.x + 1) * blockDim.x; + unsigned int next_row = row_indices[1]; + + for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x) + { + //load into shared memory (coalesced access): + if (i < nnz) + { + element_buffer[threadIdx.x] = elements[i]; + unsigned int tmp = column_indices[i]; + col_index_buffer[threadIdx.x] = tmp; + vector_buffer[threadIdx.x] = vector[tmp]; + } + + __syncthreads(); + + //now a single thread does the remaining work in shared memory: + if (threadIdx.x == 0) + { + // traverse through all the loaded data: + for (unsigned int k=0; k<blockDim.x; ++k) + { + if (current_row < size && i+k == next_row) //current row is finished. Write back result + { + vector[current_row] = current_vector_entry / diagonal_entry; + ++current_row; + if (current_row < size) //load next row's data + { + next_row = row_indices[current_row+1]; + current_vector_entry = vector[current_row]; + } + } + + if (current_row < size && col_index_buffer[k] < current_row) //substitute + { + if (col_index_buffer[k] < row_at_window_start) //use recently computed results + current_vector_entry -= element_buffer[k] * vector_buffer[k]; + else if (col_index_buffer[k] < current_row) //use buffered data + current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; + } + else if (col_index_buffer[k] == current_row) + diagonal_entry = element_buffer[k]; + + } // for k + + row_at_window_start = current_row; + } // if (get_local_id(0) == 0) + + __syncthreads(); + } //for i +} + + +template<typename NumericT> +__global__ void csr_unit_lu_backward_kernel( + const unsigned int * row_indices, + const unsigned int * column_indices, + const NumericT * elements, + NumericT * vector, + unsigned int size) +{ + __shared__ unsigned int col_index_buffer[128]; + __shared__ NumericT element_buffer[128]; + __shared__ NumericT vector_buffer[128]; + + unsigned int nnz = row_indices[size]; + unsigned int current_row = size-1; + unsigned int row_at_window_start = size-1; + NumericT current_vector_entry = vector[size-1]; + unsigned int loop_end = ( (nnz - 1) / blockDim.x) * blockDim.x; + unsigned int next_row = row_indices[size-1]; + + unsigned int i = loop_end + threadIdx.x; + while (1) + { + //load into shared memory (coalesced access): + if (i < nnz) + { + element_buffer[threadIdx.x] = elements[i]; + unsigned int tmp = column_indices[i]; + col_index_buffer[threadIdx.x] = tmp; + vector_buffer[threadIdx.x] = vector[tmp]; + } + + __syncthreads(); + + //now a single thread does the remaining work in shared memory: + if (threadIdx.x == 0) + { + // traverse through all the loaded data from back to front: + for (unsigned int k2=0; k2<blockDim.x; ++k2) + { + unsigned int k = (blockDim.x - k2) - 1; + + if (i+k >= nnz) + continue; + + if (col_index_buffer[k] > row_at_window_start) //use recently computed results + current_vector_entry -= element_buffer[k] * vector_buffer[k]; + else if (col_index_buffer[k] > current_row) //use buffered data + current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; + + if (i+k == next_row) //current row is finished. Write back result + { + vector[current_row] = current_vector_entry; + if (current_row > 0) //load next row's data + { + --current_row; + next_row = row_indices[current_row]; + current_vector_entry = vector[current_row]; + } + } + + + } // for k + + row_at_window_start = current_row; + } // if (get_local_id(0) == 0) + + __syncthreads(); + + if (i < blockDim.x) + break; + + i -= blockDim.x; + } //for i +} + + + +template<typename NumericT> +__global__ void csr_lu_backward_kernel( + const unsigned int * row_indices, + const unsigned int * column_indices, + const NumericT * elements, + NumericT * vector, + unsigned int size) +{ + __shared__ unsigned int col_index_buffer[128]; + __shared__ NumericT element_buffer[128]; + __shared__ NumericT vector_buffer[128]; + + unsigned int nnz = row_indices[size]; + unsigned int current_row = size-1; + unsigned int row_at_window_start = size-1; + NumericT current_vector_entry = vector[size-1]; + NumericT diagonal_entry; + unsigned int loop_end = ( (nnz - 1) / blockDim.x) * blockDim.x; + unsigned int next_row = row_indices[size-1]; + + unsigned int i = loop_end + threadIdx.x; + while (1) + { + //load into shared memory (coalesced access): + if (i < nnz) + { + element_buffer[threadIdx.x] = elements[i]; + unsigned int tmp = column_indices[i]; + col_index_buffer[threadIdx.x] = tmp; + vector_buffer[threadIdx.x] = vector[tmp]; + } + + __syncthreads(); + + //now a single thread does the remaining work in shared memory: + if (threadIdx.x == 0) + { + // traverse through all the loaded data from back to front: + for (unsigned int k2=0; k2<blockDim.x; ++k2) + { + unsigned int k = (blockDim.x - k2) - 1; + + if (i+k >= nnz) + continue; + + if (col_index_buffer[k] > row_at_window_start) //use recently computed results + current_vector_entry -= element_buffer[k] * vector_buffer[k]; + else if (col_index_buffer[k] > current_row) //use buffered data + current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; + else if (col_index_buffer[k] == current_row) + diagonal_entry = element_buffer[k]; + + if (i+k == next_row) //current row is finished. Write back result + { + vector[current_row] = current_vector_entry / diagonal_entry; + if (current_row > 0) //load next row's data + { + --current_row; + next_row = row_indices[current_row]; + current_vector_entry = vector[current_row]; + } + } + + + } // for k + + row_at_window_start = current_row; + } // if (get_local_id(0) == 0) + + __syncthreads(); + + if (i < blockDim.x) + break; + + i -= blockDim.x; + } //for i +} + + + +// +// transposed +// + + +template<typename NumericT> +__global__ void csr_trans_lu_forward_kernel2( + const unsigned int * row_indices, + const unsigned int * column_indices, + const NumericT * elements, + NumericT * vector, + unsigned int size) +{ + for (unsigned int row = 0; row < size; ++row) + { + NumericT result_entry = vector[row]; + + unsigned int row_start = row_indices[row]; + unsigned int row_stop = row_indices[row + 1]; + for (unsigned int entry_index = row_start + threadIdx.x; entry_index < row_stop; entry_index += blockDim.x) + { + unsigned int col_index = column_indices[entry_index]; + if (col_index > row) + vector[col_index] -= result_entry * elements[entry_index]; + } + + __syncthreads(); + } +} + +template<typename NumericT> +__global__ void csr_trans_unit_lu_forward_kernel( + const unsigned int * row_indices, + const unsigned int * column_indices, + const NumericT * elements, + NumericT * vector, + unsigned int size) +{ + __shared__ unsigned int row_index_lookahead[256]; + __shared__ unsigned int row_index_buffer[256]; + + unsigned int row_index; + unsigned int col_index; + NumericT matrix_entry; + unsigned int nnz = row_indices[size]; + unsigned int row_at_window_start = 0; + unsigned int row_at_window_end = 0; + unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x; + + for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x) + { + col_index = (i < nnz) ? column_indices[i] : 0; + matrix_entry = (i < nnz) ? elements[i] : 0; + row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x < size) ? row_indices[row_at_window_start + threadIdx.x] : nnz; + + __syncthreads(); + + if (i < nnz) + { + unsigned int row_index_inc = 0; + while (i >= row_index_lookahead[row_index_inc + 1]) + ++row_index_inc; + row_index = row_at_window_start + row_index_inc; + row_index_buffer[threadIdx.x] = row_index; + } + else + { + row_index = size+1; + row_index_buffer[threadIdx.x] = size - 1; + } + + __syncthreads(); + + row_at_window_start = row_index_buffer[0]; + row_at_window_end = row_index_buffer[blockDim.x - 1]; + + //forward elimination + for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) + { + NumericT result_entry = vector[row]; + + if ( (row_index == row) && (col_index > row) ) + vector[col_index] -= result_entry * matrix_entry; + + __syncthreads(); + } + + row_at_window_start = row_at_window_end; + } + +} + +template<typename NumericT> +__global__ void csr_trans_lu_forward_kernel( + const unsigned int * row_indices, + const unsigned int * column_indices, + const NumericT * elements, + const NumericT * diagonal_entries, + NumericT * vector, + unsigned int size) +{ + __shared__ unsigned int row_index_lookahead[256]; + __shared__ unsigned int row_index_buffer[256]; + + unsigned int row_index; + unsigned int col_index; + NumericT matrix_entry; + unsigned int nnz = row_indices[size]; + unsigned int row_at_window_start = 0; + unsigned int row_at_window_end = 0; + unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x; + + for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x) + { + col_index = (i < nnz) ? column_indices[i] : 0; + matrix_entry = (i < nnz) ? elements[i] : 0; + row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x < size) ? row_indices[row_at_window_start + threadIdx.x] : nnz; + + __syncthreads(); + + if (i < nnz) + { + unsigned int row_index_inc = 0; + while (i >= row_index_lookahead[row_index_inc + 1]) + ++row_index_inc; + row_index = row_at_window_start + row_index_inc; + row_index_buffer[threadIdx.x] = row_index; + } + else + { + row_index = size+1; + row_index_buffer[threadIdx.x] = size - 1; + } + + __syncthreads(); + + row_at_window_start = row_index_buffer[0]; + row_at_window_end = row_index_buffer[blockDim.x - 1]; + + //forward elimination + for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) + { + NumericT result_entry = vector[row] / diagonal_entries[row]; + + if ( (row_index == row) && (col_index > row) ) + vector[col_index] -= result_entry * matrix_entry; + + __syncthreads(); + } + + row_at_window_start = row_at_window_end; + } + + // final step: Divide vector by diagonal entries: + for (unsigned int i = threadIdx.x; i < size; i += blockDim.x) + vector[i] /= diagonal_entries[i]; + +} + + +template<typename NumericT> +__global__ void csr_trans_unit_lu_backward_kernel( + const unsigned int * row_indices, + const unsigned int * column_indices, + const NumericT * elements, + NumericT * vector, + unsigned int size) +{ + __shared__ unsigned int row_index_lookahead[256]; + __shared__ unsigned int row_index_buffer[256]; + + unsigned int row_index; + unsigned int col_index; + NumericT matrix_entry; + unsigned int nnz = row_indices[size]; + unsigned int row_at_window_start = size; + unsigned int row_at_window_end; + unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x; + + for (unsigned int i2 = threadIdx.x; i2 < loop_end; i2 += blockDim.x) + { + unsigned int i = (nnz - i2) - 1; + col_index = (i2 < nnz) ? column_indices[i] : 0; + matrix_entry = (i2 < nnz) ? elements[i] : 0; + row_index_lookahead[threadIdx.x] = (row_at_window_start >= threadIdx.x) ? row_indices[row_at_window_start - threadIdx.x] : 0; + + __syncthreads(); + + if (i2 < nnz) + { + unsigned int row_index_dec = 0; + while (row_index_lookahead[row_index_dec] > i) + ++row_index_dec; + row_index = row_at_window_start - row_index_dec; + row_index_buffer[threadIdx.x] = row_index; + } + else + { + row_index = size+1; + row_index_buffer[threadIdx.x] = 0; + } + + __syncthreads(); + + row_at_window_start = row_index_buffer[0]; + row_at_window_end = row_index_buffer[blockDim.x - 1]; + + //backward elimination + for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) + { + unsigned int row = row_at_window_start - row2; + NumericT result_entry = vector[row]; + + if ( (row_index == row) && (col_index < row) ) + vector[col_index] -= result_entry * matrix_entry; + + __syncthreads(); + } + + row_at_window_start = row_at_window_end; + } + +} + + + +template<typename NumericT> +__global__ void csr_trans_lu_backward_kernel2( + const unsigned int * row_indices, + const unsigned int * column_indices, + const NumericT * elements, + const NumericT * diagonal_entries, + NumericT * vector, + unsigned int size) +{ + NumericT result_entry = 0; + + //backward elimination, using U and D: + for (unsigned int row2 = 0; row2 < size; ++row2) + { + unsigned int row = (size - row2) - 1; + result_entry = vector[row] / diagonal_entries[row]; + + unsigned int row_start = row_indices[row]; + unsigned int row_stop = row_indices[row + 1]; + for (unsigned int entry_index = row_start + threadIdx.x; entry_index < row_stop; ++entry_index) + { + unsigned int col_index = column_indices[entry_index]; + if (col_index < row) + vector[col_index] -= result_entry * elements[entry_index]; + } + + __syncthreads(); + + if (threadIdx.x == 0) + vector[row] = result_entry; + } +} + + +template<typename NumericT> +__global__ void csr_trans_lu_backward_kernel( + const unsigned int * row_indices, + const unsigned int * column_indices, + const NumericT * elements, + const NumericT * diagonal_entries, + NumericT * vector, + unsigned int size) +{ + __shared__ unsigned int row_index_lookahead[256]; + __shared__ unsigned int row_index_buffer[256]; + + unsigned int row_index; + unsigned int col_index; + NumericT matrix_entry; + unsigned int nnz = row_indices[size]; + unsigned int row_at_window_start = size; + unsigned int row_at_window_end; + unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x; + + for (unsigned int i2 = threadIdx.x; i2 < loop_end; i2 += blockDim.x) + { + unsigned int i = (nnz - i2) - 1; + col_index = (i2 < nnz) ? column_indices[i] : 0; + matrix_entry = (i2 < nnz) ? elements[i] : 0; + row_index_lookahead[threadIdx.x] = (row_at_window_start >= threadIdx.x) ? row_indices[row_at_window_start - threadIdx.x] : 0; + + __syncthreads(); + + if (i2 < nnz) + { + unsigned int row_index_dec = 0; + while (row_index_lookahead[row_index_dec] > i) + ++row_index_dec; + row_index = row_at_window_start - row_index_dec; + row_index_buffer[threadIdx.x] = row_index; + } + else + { + row_index = size+1; + row_index_buffer[threadIdx.x] = 0; + } + + __syncthreads(); + + row_at_window_start = row_index_buffer[0]; + row_at_window_end = row_index_buffer[blockDim.x - 1]; + + //backward elimination + for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) + { + unsigned int row = row_at_window_start - row2; + NumericT result_entry = vector[row] / diagonal_entries[row]; + + if ( (row_index == row) && (col_index < row) ) + vector[col_index] -= result_entry * matrix_entry; + + __syncthreads(); + } + + row_at_window_start = row_at_window_end; + } + + + // final step: Divide vector by diagonal entries: + for (unsigned int i = threadIdx.x; i < size; i += blockDim.x) + vector[i] /= diagonal_entries[i]; + +} + + +template<typename NumericT> +__global__ void csr_block_trans_unit_lu_forward( + const unsigned int * row_jumper_L, //L part (note that L is transposed in memory) + const unsigned int * column_indices_L, + const NumericT * elements_L, + const unsigned int * block_offsets, + NumericT * result, + unsigned int size) +{ + unsigned int col_start = block_offsets[2*blockIdx.x]; + unsigned int col_stop = block_offsets[2*blockIdx.x+1]; + unsigned int row_start = row_jumper_L[col_start]; + unsigned int row_stop; + NumericT result_entry = 0; + + if (col_start >= col_stop) + return; + + //forward elimination, using L: + for (unsigned int col = col_start; col < col_stop; ++col) + { + result_entry = result[col]; + row_stop = row_jumper_L[col + 1]; + for (unsigned int buffer_index = row_start + threadIdx.x; buffer_index < row_stop; buffer_index += blockDim.x) + result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index]; + row_start = row_stop; //for next iteration (avoid unnecessary loads from GPU RAM) + __syncthreads(); + } + +} + + +template<typename NumericT> +__global__ void csr_block_trans_lu_backward( + const unsigned int * row_jumper_U, //U part (note that U is transposed in memory) + const unsigned int * column_indices_U, + const NumericT * elements_U, + const NumericT * diagonal_U, + const unsigned int * block_offsets, + NumericT * result, + unsigned int size) +{ + unsigned int col_start = block_offsets[2*blockIdx.x]; + unsigned int col_stop = block_offsets[2*blockIdx.x+1]; + unsigned int row_start; + unsigned int row_stop; + NumericT result_entry = 0; + + if (col_start >= col_stop) + return; + + //backward elimination, using U and diagonal_U + for (unsigned int iter = 0; iter < col_stop - col_start; ++iter) + { + unsigned int col = (col_stop - iter) - 1; + result_entry = result[col] / diagonal_U[col]; + row_start = row_jumper_U[col]; + row_stop = row_jumper_U[col + 1]; + for (unsigned int buffer_index = row_start + threadIdx.x; buffer_index < row_stop; buffer_index += blockDim.x) + result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index]; + __syncthreads(); + } + + //divide result vector by diagonal: + for (unsigned int col = col_start + threadIdx.x; col < col_stop; col += blockDim.x) + result[col] /= diagonal_U[col]; +} + + + +// +// Coordinate Matrix +// + + + + +// +// ELL Matrix +// + + + +// +// Hybrid Matrix +// + + + +} // namespace opencl +} //namespace linalg +} //namespace viennacl + + +#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/spgemm.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/spgemm.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/spgemm.hpp new file mode 100644 index 0000000..5551cda --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/spgemm.hpp @@ -0,0 +1,793 @@ +#ifndef VIENNACL_LINALG_CUDA_SPGEMM_HPP_ +#define VIENNACL_LINALG_CUDA_SPGEMM_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/cuda/sparse_matrix_operations.hpp + @brief Implementations of operations using sparse matrices using CUDA +*/ + +#include <stdexcept> + +#include <thrust/scan.h> +#include <thrust/device_ptr.h> + +#include "viennacl/forwards.h" +#include "viennacl/scalar.hpp" +#include "viennacl/vector.hpp" +#include "viennacl/tools/tools.hpp" +#include "viennacl/linalg/cuda/common.hpp" + +#include "viennacl/tools/timer.hpp" + +#include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp" + +namespace viennacl +{ +namespace linalg +{ +namespace cuda +{ + +/** @brief Loads a value from the specified address. With CUDA arch 3.5 and above the value is also stored in global constant memory for later reuse */ +template<typename NumericT> +static inline __device__ NumericT load_and_cache(const NumericT *address) +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 + return __ldg(address); +#else + return *address; +#endif +} + + +// +// Stage 1: Obtain upper bound for number of elements per row in C: +// +template<typename IndexT> +__device__ IndexT round_to_next_power_of_2(IndexT val) +{ + if (val > 32) + return 64; // just to indicate that we need to split/factor the matrix! + else if (val > 16) + return 32; + else if (val > 8) + return 16; + else if (val > 4) + return 8; + else if (val > 2) + return 4; + else if (val > 1) + return 2; + else + return 1; +} + +template<typename IndexT> +__global__ void compressed_matrix_gemm_stage_1( + const IndexT * A_row_indices, + const IndexT * A_col_indices, + IndexT A_size1, + const IndexT * B_row_indices, + IndexT *subwarpsize_per_group, + IndexT *max_nnz_row_A_per_group, + IndexT *max_nnz_row_B_per_group) +{ + unsigned int subwarpsize_in_thread = 0; + unsigned int max_nnz_row_A = 0; + unsigned int max_nnz_row_B = 0; + + unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1; + unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1)); + + for (unsigned int row = rows_per_group * blockIdx.x + threadIdx.x; row < row_per_group_end; row += blockDim.x) + { + unsigned int A_row_start = A_row_indices[row]; + unsigned int A_row_end = A_row_indices[row+1]; + unsigned int row_num = A_row_end - A_row_start; + if (row_num > 32) // too many rows in B need to be merged for a single pass + { + unsigned int subwarp_sqrt = (unsigned int)sqrt(double(row_num)) + 1; + subwarpsize_in_thread = max(subwarp_sqrt, subwarpsize_in_thread); + } + else + subwarpsize_in_thread = max(A_row_end - A_row_start, subwarpsize_in_thread); + max_nnz_row_A = max(max_nnz_row_A, row_num); + for (unsigned int j = A_row_start; j < A_row_end; ++j) + { + unsigned int col = A_col_indices[j]; + unsigned int row_len_B = B_row_indices[col + 1] - B_row_indices[col]; + max_nnz_row_B = max(row_len_B, max_nnz_row_B); + } + } + + // reduction to obtain maximum in thread block + __shared__ unsigned int shared_subwarpsize[256]; + __shared__ unsigned int shared_max_nnz_row_A[256]; + __shared__ unsigned int shared_max_nnz_row_B[256]; + + shared_subwarpsize[threadIdx.x] = subwarpsize_in_thread; + shared_max_nnz_row_A[threadIdx.x] = max_nnz_row_A; + shared_max_nnz_row_B[threadIdx.x] = max_nnz_row_B; + for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) + { + shared_subwarpsize[threadIdx.x] = max( shared_subwarpsize[threadIdx.x], shared_subwarpsize[threadIdx.x + stride]); + shared_max_nnz_row_A[threadIdx.x] = max(shared_max_nnz_row_A[threadIdx.x], shared_max_nnz_row_A[threadIdx.x + stride]); + shared_max_nnz_row_B[threadIdx.x] = max(shared_max_nnz_row_B[threadIdx.x], shared_max_nnz_row_B[threadIdx.x + stride]); + } + } + + if (threadIdx.x == 0) + { + subwarpsize_per_group[blockIdx.x] = round_to_next_power_of_2(shared_subwarpsize[0]); + max_nnz_row_A_per_group[blockIdx.x] = shared_max_nnz_row_A[0]; + max_nnz_row_B_per_group[blockIdx.x] = shared_max_nnz_row_B[0]; + } +} + +// +// Stage 2: Determine sparsity pattern of C +// +inline __device__ unsigned int merge_subwarp_symbolic(unsigned int row_B_start, unsigned int row_B_end, unsigned int const *B_col_indices, unsigned int B_size2, unsigned int subwarpsize) +{ + unsigned int current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2; + + unsigned int num_nnz = 0; + while (1) + { + // determine current minimum (warp shuffle) + unsigned int min_index = current_front_index; + for (unsigned int i = subwarpsize/2; i >= 1; i /= 2) + min_index = min(min_index, __shfl_xor((int)min_index, (int)i)); + + if (min_index == B_size2) + break; + + // update front: + current_front_index = (current_front_index == min_index) ? ((++row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2) + : current_front_index; + ++num_nnz; + } + + return num_nnz; +} + +inline __device__ unsigned int merge_subwarp_symbolic_double(unsigned int row_B_start, unsigned int row_B_end, unsigned int const *B_col_indices, unsigned int B_size2, + unsigned int *output_array, unsigned int id_in_warp, unsigned int subwarpsize) +{ + unsigned int current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2; + + unsigned int num_nnz = 0; + unsigned int index_buffer = 0; + unsigned int buffer_size = 0; + while (1) + { + // determine current minimum (warp shuffle) + unsigned int min_index = current_front_index; + for (unsigned int i = subwarpsize/2; i >= 1; i /= 2) + min_index = min(min_index, __shfl_xor((int)min_index, (int)i)); + + if (min_index == B_size2) + break; + + // update front: + current_front_index = (current_front_index == min_index) ? ((++row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2) + : current_front_index; + + // write output + index_buffer = (id_in_warp == buffer_size) ? min_index : index_buffer; + ++buffer_size; + + if (buffer_size == subwarpsize) // register buffer full? + { + output_array[id_in_warp] = index_buffer; + output_array += subwarpsize; + buffer_size = 0; + } + + ++num_nnz; + } + + // write remaining entries from register buffer: + if (id_in_warp < buffer_size) + output_array[id_in_warp] = index_buffer; + + return num_nnz; +} + +template<typename IndexT> +__global__ void compressed_matrix_gemm_stage_2( + const IndexT * A_row_indices, + const IndexT * A_col_indices, + IndexT A_size1, + const IndexT * B_row_indices, + const IndexT * B_col_indices, + IndexT B_size2, + IndexT * C_row_indices, + unsigned int *subwarpsize_array, + unsigned int *max_row_size_A, + unsigned int *max_row_size_B, + unsigned int *scratchpad_offsets, + unsigned int *scratchpad_indices) +{ + unsigned int subwarpsize = subwarpsize_array[blockIdx.x]; + + unsigned int num_warps = blockDim.x / subwarpsize; + unsigned int warp_id = threadIdx.x / subwarpsize; + unsigned int id_in_warp = threadIdx.x % subwarpsize; + + unsigned int scratchpad_rowlength = max_row_size_B[blockIdx.x] * subwarpsize; + unsigned int scratchpad_rows_per_warp = max_row_size_A[blockIdx.x] / subwarpsize + 1; + unsigned int *subwarp_scratchpad_start = scratchpad_indices + scratchpad_offsets[blockIdx.x] + warp_id * scratchpad_rows_per_warp * scratchpad_rowlength; + + unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1; + unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1)); + + for (unsigned int row = rows_per_group * blockIdx.x + warp_id; row < row_per_group_end; row += num_warps) + { + unsigned int row_A_start = A_row_indices[row]; + unsigned int row_A_end = A_row_indices[row+1]; + + if (row_A_end - row_A_start > subwarpsize) + { + unsigned int final_merge_start = 0; + unsigned int final_merge_end = 0; + + // merge to temporary scratchpad memory: + unsigned int *subwarp_scratchpad = subwarp_scratchpad_start; + unsigned int iter = 0; + while (row_A_end > row_A_start) + { + unsigned int my_row_B = row_A_start + id_in_warp; + unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0; + unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0; + unsigned int row_B_end = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0; + + unsigned int nnz_in_merge = merge_subwarp_symbolic_double(row_B_start, row_B_end, B_col_indices, B_size2, + subwarp_scratchpad, id_in_warp, subwarpsize); + + final_merge_start = (iter == id_in_warp) ? subwarp_scratchpad - scratchpad_indices : final_merge_start; + final_merge_end = (iter == id_in_warp) ? final_merge_start + nnz_in_merge : final_merge_end; + ++iter; + + row_A_start += subwarpsize; + subwarp_scratchpad += scratchpad_rowlength; // write to next row in scratchpad + } + + // final merge: + unsigned int num_nnz = merge_subwarp_symbolic(final_merge_start, final_merge_end, scratchpad_indices, B_size2, subwarpsize); + + if (id_in_warp == 0) + C_row_indices[row] = num_nnz; + } + else + { + // single merge + unsigned int my_row_B = row_A_start + id_in_warp; + unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0; + unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0; + unsigned int row_B_end = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0; + + unsigned int num_nnz = merge_subwarp_symbolic(row_B_start, row_B_end, B_col_indices, B_size2, subwarpsize); + + if (id_in_warp == 0) + C_row_indices[row] = num_nnz; + } + } + +} + + +// +// Stage 3: Fill C with values +// +template<typename NumericT> +__device__ unsigned int merge_subwarp_numeric(NumericT scaling_factor, + unsigned int input_start, unsigned int input_end, const unsigned int *input_indices, const NumericT *input_values, unsigned int invalid_token, + unsigned int *output_indices, NumericT *output_values, + unsigned int id_in_warp, unsigned int subwarpsize) +{ + unsigned int current_front_index = (input_start < input_end) ? load_and_cache(input_indices + input_start) : invalid_token; + NumericT current_front_value = (input_start < input_end) ? load_and_cache(input_values + input_start) : 0; + + unsigned int index_buffer = 0; + NumericT value_buffer = 0; + unsigned int buffer_size = 0; + unsigned int nnz_written = 0; + while (1) + { + // determine current minimum: + unsigned int min_index = current_front_index; + for (unsigned int i = subwarpsize/2; i >= 1; i /= 2) + min_index = min(min_index, __shfl_xor((int)min_index, (int)i)); + + if (min_index == invalid_token) // done + break; + + // compute entry in C: + NumericT output_value = (current_front_index == min_index) ? scaling_factor * current_front_value : 0; + for (unsigned int i = subwarpsize/2; i >= 1; i /= 2) + output_value += __shfl_xor((int)output_value, (int)i); + + // update front: + if (current_front_index == min_index) + { + ++input_start; + current_front_index = (input_start < input_end) ? load_and_cache(input_indices + input_start) : invalid_token; + current_front_value = (input_start < input_end) ? load_and_cache(input_values + input_start) : 0; + } + + // write current front to register buffer: + index_buffer = (id_in_warp == buffer_size) ? min_index : index_buffer; + value_buffer = (id_in_warp == buffer_size) ? output_value : value_buffer; + ++buffer_size; + + // flush register buffer via a coalesced write once full: + if (buffer_size == subwarpsize) + { + output_indices[id_in_warp] = index_buffer; output_indices += subwarpsize; + output_values[id_in_warp] = value_buffer; output_values += subwarpsize; + buffer_size = 0; + } + + ++nnz_written; + } + + // write remaining entries in register buffer to C: + if (id_in_warp < buffer_size) + { + output_indices[id_in_warp] = index_buffer; + output_values[id_in_warp] = value_buffer; + } + + return nnz_written; +} + +template<typename IndexT, typename NumericT> +__global__ void compressed_matrix_gemm_stage_3( + const IndexT * A_row_indices, + const IndexT * A_col_indices, + const NumericT * A_elements, + IndexT A_size1, + const IndexT * B_row_indices, + const IndexT * B_col_indices, + const NumericT * B_elements, + IndexT B_size2, + IndexT const * C_row_indices, + IndexT * C_col_indices, + NumericT * C_elements, + unsigned int *subwarpsize_array, + unsigned int *max_row_size_A, + unsigned int *max_row_size_B, + unsigned int *scratchpad_offsets, + unsigned int *scratchpad_indices, + NumericT *scratchpad_values) +{ + unsigned int subwarpsize = subwarpsize_array[blockIdx.x]; + + unsigned int num_warps = blockDim.x / subwarpsize; + unsigned int warp_id = threadIdx.x / subwarpsize; + unsigned int id_in_warp = threadIdx.x % subwarpsize; + + unsigned int scratchpad_rowlength = max_row_size_B[blockIdx.x] * subwarpsize; + unsigned int scratchpad_rows_per_warp = max_row_size_A[blockIdx.x] / subwarpsize + 1; + unsigned int subwarp_scratchpad_shift = scratchpad_offsets[blockIdx.x] + warp_id * scratchpad_rows_per_warp * scratchpad_rowlength; + + unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1; + unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1)); + + for (unsigned int row = rows_per_group * blockIdx.x + warp_id; row < row_per_group_end; row += num_warps) + { + unsigned int row_A_start = A_row_indices[row]; + unsigned int row_A_end = A_row_indices[row+1]; + + if (row_A_end - row_A_start > subwarpsize) + { + // first merge stage: + unsigned int final_merge_start = 0; + unsigned int final_merge_end = 0; + unsigned int iter = 0; + unsigned int *scratchpad_indices_ptr = scratchpad_indices + subwarp_scratchpad_shift; + NumericT *scratchpad_values_ptr = scratchpad_values + subwarp_scratchpad_shift; + + while (row_A_start < row_A_end) + { + unsigned int my_row_B = row_A_start + id_in_warp; + unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0; + unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0; + unsigned int row_B_end = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0; + NumericT val_A = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0; + + unsigned int nnz_written = merge_subwarp_numeric(val_A, + row_B_start, row_B_end, B_col_indices, B_elements, B_size2, + scratchpad_indices_ptr, scratchpad_values_ptr, + id_in_warp, subwarpsize); + + if (iter == id_in_warp) + { + final_merge_start = scratchpad_indices_ptr - scratchpad_indices; + final_merge_end = final_merge_start + nnz_written; + } + ++iter; + + row_A_start += subwarpsize; + scratchpad_indices_ptr += scratchpad_rowlength; + scratchpad_values_ptr += scratchpad_rowlength; + } + + // second merge stage: + unsigned int index_in_C = C_row_indices[row]; + merge_subwarp_numeric(NumericT(1), + final_merge_start, final_merge_end, scratchpad_indices, scratchpad_values, B_size2, + C_col_indices + index_in_C, C_elements + index_in_C, + id_in_warp, subwarpsize); + } + else + { + unsigned int my_row_B = row_A_start + id_in_warp; + unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0; + unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0; + unsigned int row_B_end = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0; + NumericT val_A = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0; + + unsigned int index_in_C = C_row_indices[row]; + + merge_subwarp_numeric(val_A, + row_B_start, row_B_end, B_col_indices, B_elements, B_size2, + C_col_indices + index_in_C, C_elements + index_in_C, + id_in_warp, subwarpsize); + } + } + +} + + + + +// +// Decomposition kernels: +// +template<typename IndexT> +__global__ void compressed_matrix_gemm_decompose_1( + const IndexT * A_row_indices, + IndexT A_size1, + IndexT max_per_row, + IndexT *chunks_per_row) +{ + for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_size1; i += blockDim.x * gridDim.x) + { + IndexT num_entries = A_row_indices[i+1] - A_row_indices[i]; + chunks_per_row[i] = (num_entries < max_per_row) ? 1 : ((num_entries - 1)/ max_per_row + 1); + } +} + + +template<typename IndexT, typename NumericT> +__global__ void compressed_matrix_gemm_A2( + IndexT * A2_row_indices, + IndexT * A2_col_indices, + NumericT * A2_elements, + IndexT A2_size1, + IndexT *new_row_buffer) +{ + for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A2_size1; i += blockDim.x * gridDim.x) + { + unsigned int index_start = new_row_buffer[i]; + unsigned int index_stop = new_row_buffer[i+1]; + + A2_row_indices[i] = index_start; + + for (IndexT j = index_start; j < index_stop; ++j) + { + A2_col_indices[j] = j; + A2_elements[j] = NumericT(1); + } + } + + // write last entry in row_buffer with global thread 0: + if (threadIdx.x == 0 && blockIdx.x == 0) + A2_row_indices[A2_size1] = new_row_buffer[A2_size1]; +} + +template<typename IndexT, typename NumericT> +__global__ void compressed_matrix_gemm_G1( + IndexT * G1_row_indices, + IndexT * G1_col_indices, + NumericT * G1_elements, + IndexT G1_size1, + IndexT const *A_row_indices, + IndexT const *A_col_indices, + NumericT const *A_elements, + IndexT A_size1, + IndexT A_nnz, + IndexT max_per_row, + IndexT *new_row_buffer) +{ + // Part 1: Copy column indices and entries: + for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_nnz; i += blockDim.x * gridDim.x) + { + G1_col_indices[i] = A_col_indices[i]; + G1_elements[i] = A_elements[i]; + } + + // Part 2: Derive new row indicies: + for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_size1; i += blockDim.x * gridDim.x) + { + unsigned int old_start = A_row_indices[i]; + unsigned int new_start = new_row_buffer[i]; + unsigned int row_chunks = new_row_buffer[i+1] - new_start; + + for (IndexT j=0; j<row_chunks; ++j) + G1_row_indices[new_start + j] = old_start + j * max_per_row; + } + + // write last entry in row_buffer with global thread 0: + if (threadIdx.x == 0 && blockIdx.x == 0) + G1_row_indices[G1_size1] = A_row_indices[A_size1]; +} + + + +/** @brief Carries out sparse_matrix-sparse_matrix multiplication for CSR matrices +* +* Implementation of the convenience expression C = prod(A, B); +* Based on computing C(i, :) = A(i, :) * B via merging the respective rows of B +* +* @param A Left factor +* @param B Right factor +* @param C Result matrix +*/ +template<class NumericT, unsigned int AlignmentV> +void prod_impl(viennacl::compressed_matrix<NumericT, AlignmentV> const & A, + viennacl::compressed_matrix<NumericT, AlignmentV> const & B, + viennacl::compressed_matrix<NumericT, AlignmentV> & C) +{ + C.resize(A.size1(), B.size2(), false); + + unsigned int blocknum = 256; + unsigned int threadnum = 128; + + viennacl::vector<unsigned int> subwarp_sizes(blocknum, viennacl::traits::context(A)); // upper bound for the nonzeros per row encountered for each work group + viennacl::vector<unsigned int> max_nnz_row_A(blocknum, viennacl::traits::context(A)); // upper bound for the nonzeros per row encountered for each work group + viennacl::vector<unsigned int> max_nnz_row_B(blocknum, viennacl::traits::context(A)); // upper bound for the nonzeros per row encountered for each work group + +#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS + viennacl::tools::timer timer; +#endif + + // + // Stage 1: Determine upper bound for number of nonzeros + // +#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS + cudaDeviceSynchronize(); + timer.start(); +#endif + + compressed_matrix_gemm_stage_1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()), + viennacl::cuda_arg<unsigned int>(A.handle2()), + static_cast<unsigned int>(A.size1()), + viennacl::cuda_arg<unsigned int>(B.handle1()), + viennacl::cuda_arg(subwarp_sizes), + viennacl::cuda_arg(max_nnz_row_A), + viennacl::cuda_arg(max_nnz_row_B) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_1"); +#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS + cudaDeviceSynchronize(); + std::cout << "Stage 1 device: " << timer.get() << std::endl; + timer.start(); +#endif + + subwarp_sizes.switch_memory_context(viennacl::context(MAIN_MEMORY)); + unsigned int * subwarp_sizes_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(subwarp_sizes.handle()); + + max_nnz_row_A.switch_memory_context(viennacl::context(MAIN_MEMORY)); + unsigned int const * max_nnz_row_A_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(max_nnz_row_A.handle()); + + max_nnz_row_B.switch_memory_context(viennacl::context(MAIN_MEMORY)); + unsigned int const * max_nnz_row_B_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(max_nnz_row_B.handle()); + + //std::cout << "Subwarp sizes: " << subwarp_sizes << std::endl; + + viennacl::vector<unsigned int> scratchpad_offsets(blocknum, viennacl::context(MAIN_MEMORY)); // upper bound for the nonzeros per row encountered for each work group + unsigned int * scratchpad_offsets_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(scratchpad_offsets.handle()); + + unsigned int max_subwarp_size = 0; + unsigned int A_max_nnz_per_row = 0; + unsigned int scratchpad_offset = 0; + //std::cout << "Scratchpad offsets: " << std::endl; + for (std::size_t i=0; i<subwarp_sizes.size(); ++i) + { + max_subwarp_size = std::max(max_subwarp_size, subwarp_sizes_ptr[i]); + A_max_nnz_per_row = std::max(A_max_nnz_per_row, max_nnz_row_A_ptr[i]); + + scratchpad_offsets_ptr[i] = scratchpad_offset; + //std::cout << scratchpad_offset << " (with " << (max_nnz_row_A_ptr[i] / subwarp_sizes_ptr[i] + 1) << " warp reloads per group at " << max_nnz_row_A_ptr[i] << " max rows, " + // << upper_bound_nonzeros_per_row_C_ptr[i] << " row length, " + // << (256 / subwarp_sizes_ptr[i]) << " warps per group " << std::endl; + unsigned int max_warp_reloads = max_nnz_row_A_ptr[i] / subwarp_sizes_ptr[i] + 1; + unsigned int max_row_length_after_warp_merge = subwarp_sizes_ptr[i] * max_nnz_row_B_ptr[i]; + unsigned int warps_in_group = threadnum / subwarp_sizes_ptr[i]; + scratchpad_offset += max_warp_reloads + * max_row_length_after_warp_merge + * warps_in_group; + } + //std::cout << "Scratchpad memory for indices: " << scratchpad_offset << " entries (" << scratchpad_offset * sizeof(unsigned int) * 1e-6 << " MB)" << std::endl; + + if (max_subwarp_size > 32) + { + // determine augmented size: + unsigned int max_entries_in_G = 1024; + if (A_max_nnz_per_row <= 512*512) + max_entries_in_G = 512; + if (A_max_nnz_per_row <= 256*256) + max_entries_in_G = 256; + if (A_max_nnz_per_row <= 128*128) + max_entries_in_G = 128; + if (A_max_nnz_per_row <= 64*64) + max_entries_in_G = 64; + + viennacl::vector<unsigned int> exclusive_scan_helper(A.size1() + 1, viennacl::traits::context(A)); + compressed_matrix_gemm_decompose_1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()), + static_cast<unsigned int>(A.size1()), + static_cast<unsigned int>(max_entries_in_G), + viennacl::cuda_arg(exclusive_scan_helper) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_decompose_1"); + + thrust::exclusive_scan(thrust::device_ptr<unsigned int>(viennacl::cuda_arg(exclusive_scan_helper)), + thrust::device_ptr<unsigned int>(viennacl::cuda_arg(exclusive_scan_helper) + exclusive_scan_helper.size()), + thrust::device_ptr<unsigned int>(viennacl::cuda_arg(exclusive_scan_helper))); + + unsigned int augmented_size = exclusive_scan_helper[A.size1()]; + + // split A = A2 * G1 + viennacl::compressed_matrix<NumericT, AlignmentV> A2(A.size1(), augmented_size, augmented_size, viennacl::traits::context(A)); + viennacl::compressed_matrix<NumericT, AlignmentV> G1(augmented_size, A.size2(), A.nnz(), viennacl::traits::context(A)); + + // fill A2: + compressed_matrix_gemm_A2<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A2.handle1()), + viennacl::cuda_arg<unsigned int>(A2.handle2()), + viennacl::cuda_arg<NumericT>(A2.handle()), + static_cast<unsigned int>(A2.size1()), + viennacl::cuda_arg(exclusive_scan_helper) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_A2"); + + // fill G1: + compressed_matrix_gemm_G1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(G1.handle1()), + viennacl::cuda_arg<unsigned int>(G1.handle2()), + viennacl::cuda_arg<NumericT>(G1.handle()), + static_cast<unsigned int>(G1.size1()), + viennacl::cuda_arg<unsigned int>(A.handle1()), + viennacl::cuda_arg<unsigned int>(A.handle2()), + viennacl::cuda_arg<NumericT>(A.handle()), + static_cast<unsigned int>(A.size1()), + static_cast<unsigned int>(A.nnz()), + static_cast<unsigned int>(max_entries_in_G), + viennacl::cuda_arg(exclusive_scan_helper) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_G1"); + + // compute tmp = G1 * B; + // C = A2 * tmp; + viennacl::compressed_matrix<NumericT, AlignmentV> tmp(G1.size1(), B.size2(), 0, viennacl::traits::context(A)); + prod_impl(G1, B, tmp); // this runs a standard RMerge without decomposition of G1 + prod_impl(A2, tmp, C); // this may split A2 again + return; + } + + subwarp_sizes.switch_memory_context(viennacl::traits::context(A)); + max_nnz_row_A.switch_memory_context(viennacl::traits::context(A)); + max_nnz_row_B.switch_memory_context(viennacl::traits::context(A)); + scratchpad_offsets.switch_memory_context(viennacl::traits::context(A)); + + viennacl::vector<unsigned int> scratchpad_indices(scratchpad_offset, viennacl::traits::context(A)); // upper bound for the nonzeros per row encountered for each work group + +#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS + std::cout << "Intermediate host stage: " << timer.get() << std::endl; + timer.start(); +#endif + + // + // Stage 2: Determine pattern of C + // + + compressed_matrix_gemm_stage_2<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()), + viennacl::cuda_arg<unsigned int>(A.handle2()), + static_cast<unsigned int>(A.size1()), + viennacl::cuda_arg<unsigned int>(B.handle1()), + viennacl::cuda_arg<unsigned int>(B.handle2()), + static_cast<unsigned int>(B.size2()), + viennacl::cuda_arg<unsigned int>(C.handle1()), + viennacl::cuda_arg(subwarp_sizes), + viennacl::cuda_arg(max_nnz_row_A), + viennacl::cuda_arg(max_nnz_row_B), + viennacl::cuda_arg(scratchpad_offsets), + viennacl::cuda_arg(scratchpad_indices) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_2"); +#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS + cudaDeviceSynchronize(); + std::cout << "Stage 2: " << timer.get() << std::endl; + timer.start(); +#endif + + + // exclusive scan on C.handle1(), ultimately allowing to allocate remaining memory for C + viennacl::backend::typesafe_host_array<unsigned int> row_buffer(C.handle1(), C.size1() + 1); + viennacl::backend::memory_read(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get()); + unsigned int current_offset = 0; + for (std::size_t i=0; i<C.size1(); ++i) + { + unsigned int tmp = row_buffer[i]; + row_buffer.set(i, current_offset); + current_offset += tmp; + } + row_buffer.set(C.size1(), current_offset); + viennacl::backend::memory_write(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get()); + + + // + // Stage 3: Compute entries in C + // + C.reserve(current_offset, false); + + viennacl::vector<NumericT> scratchpad_values(scratchpad_offset, viennacl::traits::context(A)); // upper bound for the nonzeros per row encountered for each work group + +#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS + std::cout << "Intermediate stage 2->3: " << timer.get() << std::endl; + timer.start(); +#endif + + compressed_matrix_gemm_stage_3<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()), + viennacl::cuda_arg<unsigned int>(A.handle2()), + viennacl::cuda_arg<NumericT>(A.handle()), + static_cast<unsigned int>(A.size1()), + viennacl::cuda_arg<unsigned int>(B.handle1()), + viennacl::cuda_arg<unsigned int>(B.handle2()), + viennacl::cuda_arg<NumericT>(B.handle()), + static_cast<unsigned int>(B.size2()), + viennacl::cuda_arg<unsigned int>(C.handle1()), + viennacl::cuda_arg<unsigned int>(C.handle2()), + viennacl::cuda_arg<NumericT>(C.handle()), + viennacl::cuda_arg(subwarp_sizes), + viennacl::cuda_arg(max_nnz_row_A), + viennacl::cuda_arg(max_nnz_row_B), + viennacl::cuda_arg(scratchpad_offsets), + viennacl::cuda_arg(scratchpad_indices), + viennacl::cuda_arg(scratchpad_values) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_3"); +#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS + cudaDeviceSynchronize(); + std::cout << "Stage 3: " << timer.get() << std::endl; + std::cout << "----------" << std::endl; +#endif + +} + +} // namespace cuda +} //namespace linalg +} //namespace viennacl + + +#endif
