http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp new file mode 100644 index 0000000..65b323e --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp @@ -0,0 +1,821 @@ +#ifndef VIENNACL_LINALG_CUDA_AMG_OPERATIONS_HPP +#define VIENNACL_LINALG_CUDA_AMG_OPERATIONS_HPP + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the PDF manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file cuda/amg_operations.hpp + @brief Implementations of routines for AMG in OpenCL. 
*/

#include <cstdlib>
#include <cmath>
#include "viennacl/linalg/detail/amg/amg_base.hpp"

#include <map>
#include <set>

namespace viennacl
{
namespace linalg
{
namespace cuda
{
namespace amg
{


///////////////////////////////////////////

/** @brief CUDA kernel copying the CSR sparsity pattern of A into the influence arrays.
*
* Grid-stride loops over rows and nonzeros, so any 1D launch configuration covers all data.
* influences_row receives the CSR row pointer (size1+1 entries, finalizer written by thread 0),
* influences_id the column indices, and influences_values the per-row nonzero counts.
*/
__global__ void amg_influence_trivial_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          unsigned int size1,
          unsigned int nnz,
          unsigned int *influences_row,
          unsigned int *influences_id,
          unsigned int *influences_values
          )
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  // copy the row pointer and record how many entries each row holds
  for (unsigned int row = global_id; row < size1; row += global_size)
  {
    unsigned int row_start = row_indices[row];
    influences_row[row]    = row_start;
    influences_values[row] = row_indices[row+1] - row_start;
  }

  // copy all column indices verbatim
  for (unsigned int idx = global_id; idx < nnz; idx += global_size)
    influences_id[idx] = column_indices[idx];

  // a single thread writes the CSR finalizer entry
  if (global_id == 0)
    influences_row[size1] = row_indices[size1];
}


/** @brief Routine for taking all connections in the matrix as strong */
template<typename NumericT>
void amg_influence_trivial(compressed_matrix<NumericT> const & A,
                           viennacl::linalg::detail::amg::amg_level_context & amg_context,
                           viennacl::linalg::amg_tag & tag)
{
  (void)tag; // unused: every connection counts as strong, no threshold required

  amg_influence_trivial_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                             viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                             static_cast<unsigned int>(A.size1()),
                                             static_cast<unsigned int>(A.nnz()),
                                             viennacl::cuda_arg(amg_context.influence_jumper_),
                                             viennacl::cuda_arg(amg_context.influence_ids_),
                                             viennacl::cuda_arg(amg_context.influence_values_)
                                            );
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_influence_trivial_kernel");
}


/** @brief Routine for extracting strongly connected points considering a user-provided threshold value */
template<typename NumericT>
void amg_influence_advanced(compressed_matrix<NumericT> const & A,
viennacl::linalg::detail::amg::amg_level_context & amg_context, + viennacl::linalg::amg_tag & tag) +{ + throw std::runtime_error("not implemented yet"); +} + +/** @brief Dispatcher for influence processing */ +template<typename NumericT> +void amg_influence(compressed_matrix<NumericT> const & A, + viennacl::linalg::detail::amg::amg_level_context & amg_context, + viennacl::linalg::amg_tag & tag) +{ + // TODO: dispatch based on influence tolerance provided + amg_influence_trivial(A, amg_context, tag); +} + +/** @brief Assign IDs to coarse points. +* +* TODO: Use exclusive_scan on GPU for this. +*/ +inline void enumerate_coarse_points(viennacl::linalg::detail::amg::amg_level_context & amg_context) +{ + viennacl::backend::typesafe_host_array<unsigned int> point_types(amg_context.point_types_.handle(), amg_context.point_types_.size()); + viennacl::backend::typesafe_host_array<unsigned int> coarse_ids(amg_context.coarse_id_.handle(), amg_context.coarse_id_.size()); + viennacl::backend::memory_read(amg_context.point_types_.handle(), 0, point_types.raw_size(), point_types.get()); + viennacl::backend::memory_read(amg_context.coarse_id_.handle(), 0, coarse_ids.raw_size(), coarse_ids.get()); + + unsigned int coarse_id = 0; + for (std::size_t i=0; i<amg_context.point_types_.size(); ++i) + { + coarse_ids.set(i, coarse_id); + if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE) + ++coarse_id; + } + + amg_context.num_coarse_ = coarse_id; + + viennacl::backend::memory_write(amg_context.coarse_id_.handle(), 0, coarse_ids.raw_size(), coarse_ids.get()); +} + +////////////////////////////////////// + +/** @brief CUDA kernel initializing the work vectors at each PMIS iteration */ +template<typename IndexT> +__global__ void amg_pmis2_init_workdata(IndexT *work_state, + IndexT *work_random, + IndexT *work_index, + IndexT const *point_types, + IndexT const *random_weights, + unsigned int size) +{ + unsigned int global_id = blockDim.x * blockIdx.x + 
threadIdx.x; + unsigned int global_size = gridDim.x * blockDim.x; + + for (unsigned int i = global_id; i < size; i += global_size) + { + switch (point_types[i]) + { + case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED: work_state[i] = 1; break; + case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE: work_state[i] = 0; break; + case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE: work_state[i] = 2; break; + default: + break; // do nothing + } + + work_random[i] = random_weights[i]; + work_index[i] = i; + } +} + +/** @brief CUDA kernel propagating the state triple (status, weight, nodeID) to neighbors using a max()-operation */ +template<typename IndexT> +__global__ void amg_pmis2_max_neighborhood(IndexT const *work_state, + IndexT const *work_random, + IndexT const *work_index, + IndexT *work_state2, + IndexT *work_random2, + IndexT *work_index2, + IndexT const *influences_row, + IndexT const *influences_id, + unsigned int size) +{ + unsigned int global_id = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int global_size = gridDim.x * blockDim.x; + + for (unsigned int i = global_id; i < size; i += global_size) + { + // load + unsigned int state = work_state[i]; + unsigned int random = work_random[i]; + unsigned int index = work_index[i]; + + // max + unsigned int j_stop = influences_row[i + 1]; + for (unsigned int j = influences_row[i]; j < j_stop; ++j) + { + unsigned int influenced_point_id = influences_id[j]; + + // lexigraphical triple-max (not particularly pretty, but does the job): + if (state < work_state[influenced_point_id]) + { + state = work_state[influenced_point_id]; + random = work_random[influenced_point_id]; + index = work_index[influenced_point_id]; + } + else if (state == work_state[influenced_point_id]) + { + if (random < work_random[influenced_point_id]) + { + state = work_state[influenced_point_id]; + random = work_random[influenced_point_id]; + index = 
work_index[influenced_point_id]; + } + else if (random == work_random[influenced_point_id]) + { + if (index < work_index[influenced_point_id]) + { + state = work_state[influenced_point_id]; + random = work_random[influenced_point_id]; + index = work_index[influenced_point_id]; + } + } // max(random) + } // max(state) + } // for + + // store + work_state2[i] = state; + work_random2[i] = random; + work_index2[i] = index; + } +} + +/** @brief CUDA kernel for marking MIS and non-MIS nodes */ +template<typename IndexT> +__global__ void amg_pmis2_mark_mis_nodes(IndexT const *work_state, + IndexT const *work_index, + IndexT *point_types, + IndexT *undecided_buffer, + unsigned int size) +{ + unsigned int global_id = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int global_size = gridDim.x * blockDim.x; + + unsigned int num_undecided = 0; + for (unsigned int i = global_id; i < size; i += global_size) + { + unsigned int max_state = work_state[i]; + unsigned int max_index = work_index[i]; + + if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) + { + if (i == max_index) // make this a MIS node + point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE; + else if (max_state == 2) // mind the mapping of viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE above! + point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE; + else + num_undecided += 1; + } + } + + // reduction of the number of undecided nodes: + __shared__ unsigned int shared_buffer[256]; + shared_buffer[threadIdx.x] = num_undecided; + for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) + shared_buffer[threadIdx.x] += shared_buffer[threadIdx.x+stride]; + } + + if (threadIdx.x == 0) + undecided_buffer[blockIdx.x] = shared_buffer[0]; + +} + +/** @brief CUDA kernel for resetting non-MIS (i.e. 
coarse) points to undecided so that subsequent kernels work */
__global__ void amg_pmis2_reset_state(unsigned int *point_types, unsigned int size)
{
  unsigned int global_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  // grid-stride loop: every point that is not coarse reverts to undecided
  for (unsigned int i = global_id; i < size; i += global_size)
  {
    if (point_types[i] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
      point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED;
  }
}

/** @brief AG (aggregation based) coarsening, single-threaded version of stage 1
*
* @param A           Operator matrix on all levels
* @param amg_context AMG hierarchy datastructures
* @param tag         AMG preconditioner tag
*/
template<typename NumericT>
void amg_coarse_ag_stage1_mis2(compressed_matrix<NumericT> const & A,
                               viennacl::linalg::detail::amg::amg_level_context & amg_context,
                               viennacl::linalg::amg_tag & tag)
{
  // Per-point random weights used as tie-breakers in the parallel MIS-2 selection.
  // They are filled on the host (main memory) and then moved to the device context of A.
  // NOTE(review): rand() is not seeded here, so the weight sequence is the same every run — confirm this is intended.
  viennacl::vector<unsigned int> random_weights(A.size1(), viennacl::context(viennacl::MAIN_MEMORY));
  unsigned int *random_weights_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(random_weights.handle());
  for (std::size_t i=0; i<random_weights.size(); ++i)
    random_weights_ptr[i] = static_cast<unsigned int>(rand()) % static_cast<unsigned int>(A.size1());
  random_weights.switch_memory_context(viennacl::traits::context(A));

  // work vectors: (state, random, index) triples, double-buffered for the max-propagation
  viennacl::vector<unsigned int> work_state(A.size1(), viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_random(A.size1(), viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_index(A.size1(), viennacl::traits::context(A));

  viennacl::vector<unsigned int> work_state2(A.size1(), viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_random2(A.size1(), viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_index2(A.size1(), viennacl::traits::context(A));

  // initially every point is undecided
  unsigned int num_undecided =
static_cast<unsigned int>(A.size1());
  viennacl::vector<unsigned int> undecided_buffer(256, viennacl::traits::context(A)); // one slot per block (gridDim.x == 128 below)
  viennacl::backend::typesafe_host_array<unsigned int> undecided_buffer_host(undecided_buffer.handle(), undecided_buffer.size());

  unsigned int pmis_iters = 0; // iteration counter, kept for debugging/statistics
  while (num_undecided > 0)
  {
    ++pmis_iters;

    //
    // init temporary work data:
    //
    amg_pmis2_init_workdata<<<128, 128>>>(viennacl::cuda_arg(work_state),
                                          viennacl::cuda_arg(work_random),
                                          viennacl::cuda_arg(work_index),
                                          viennacl::cuda_arg(amg_context.point_types_),
                                          viennacl::cuda_arg(random_weights),
                                          static_cast<unsigned int>(A.size1())
                                         );
    VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_init_workdata"); // fixed: previously reported the wrong kernel name ("amg_pmis2_reset_state")


    //
    // Propagate maximum tuple twice (MIS-2: independence over distance-2 neighborhoods)
    //
    for (unsigned int r = 0; r < 2; ++r)
    {
      // max operation over neighborhood
      amg_pmis2_max_neighborhood<<<128, 128>>>(viennacl::cuda_arg(work_state),
                                               viennacl::cuda_arg(work_random),
                                               viennacl::cuda_arg(work_index),
                                               viennacl::cuda_arg(work_state2),
                                               viennacl::cuda_arg(work_random2),
                                               viennacl::cuda_arg(work_index2),
                                               viennacl::cuda_arg(amg_context.influence_jumper_),
                                               viennacl::cuda_arg(amg_context.influence_ids_),
                                               static_cast<unsigned int>(A.size1())
                                              );
      VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_max_neighborhood");

      // copy work array (can be fused into a single kernel if needed. Previous kernel is in most cases sufficiently heavy)
      work_state  = work_state2;
      work_random = work_random2;
      work_index  = work_index2;
    }

    //
    // mark MIS and non-MIS nodes:
    //
    amg_pmis2_mark_mis_nodes<<<128, 128>>>(viennacl::cuda_arg(work_state),
                                           viennacl::cuda_arg(work_index),
                                           viennacl::cuda_arg(amg_context.point_types_),
                                           viennacl::cuda_arg(undecided_buffer),
                                           static_cast<unsigned int>(A.size1())
                                          );
    VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_mark_mis_nodes"); // fixed: previously reported the wrong kernel name ("amg_pmis2_reset_state")

    // get number of undecided points on host:
    viennacl::backend::memory_read(undecided_buffer.handle(), 0, undecided_buffer_host.raw_size(), undecided_buffer_host.get());
    num_undecided = 0;
    for (std::size_t i=0; i<undecided_buffer.size(); ++i)
      num_undecided += undecided_buffer_host[i];

  } //while

  // consistency with sequential MIS: reset state for non-coarse points, so that coarse indices are correctly picked up later
  amg_pmis2_reset_state<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                      static_cast<unsigned int>(amg_context.point_types_.size())
                                     );
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_reset_state");
}





/** @brief CUDA kernel propagating each coarse point's aggregate index to its neighbors and marking them fine. */
template<typename IndexT>
__global__ void amg_agg_propagate_coarse_indices(IndexT *point_types,
                                                 IndexT *coarse_ids,
                                                 IndexT const *influences_row,
                                                 IndexT const *influences_id,
                                                 unsigned int size)
{
  unsigned int global_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
    {
      unsigned int coarse_index = coarse_ids[i];

      unsigned int j_stop = influences_row[i + 1];
      for (unsigned int j = influences_row[i]; j < j_stop; ++j)
      {
        unsigned int influenced_point_id = influences_id[j];
        coarse_ids[influenced_point_id] = coarse_index; // Set aggregate index for fine point

        if (influenced_point_id != i) // Note: Any write races
between threads are harmless here
          point_types[influenced_point_id] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
      }
    }
  }
}


/** @brief CUDA kernel attaching each still-undecided point to the first decided aggregate found in its neighborhood. */
template<typename IndexT>
__global__ void amg_agg_merge_undecided(IndexT *point_types,
                                        IndexT *coarse_ids,
                                        IndexT const *influences_row,
                                        IndexT const *influences_id,
                                        unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    if (point_types[i] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
      continue;

    // scan neighbors; the first decided one donates its aggregate index
    unsigned int j_stop = influences_row[i + 1];
    for (unsigned int j = influences_row[i]; j < j_stop; ++j)
    {
      unsigned int influenced_point_id = influences_id[j];
      if (point_types[influenced_point_id] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) // either coarse or fine point
      {
        coarse_ids[i] = coarse_ids[influenced_point_id];
        break;
      }
    }
  }
}


/** @brief CUDA kernel demoting any remaining undecided points to fine points (their coarse ID was set previously). */
__global__ void amg_agg_merge_undecided_2(unsigned int *point_types,
                                          unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
      point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
  }
}


/** @brief AG (aggregation based) coarsening. Partially single-threaded version (VIENNACL_AMG_COARSE_AG)
*
* @param A           Operator matrix
* @param amg_context AMG hierarchy datastructures
* @param tag         AMG preconditioner tag
*/
template<typename NumericT>
void amg_coarse_ag(compressed_matrix<NumericT> const & A,
                   viennacl::linalg::detail::amg::amg_level_context & amg_context,
                   viennacl::linalg::amg_tag & tag)
{

  amg_influence_trivial(A, amg_context, tag);

  //
  // Stage 1: Build aggregates:
  //
  if (tag.get_coarsening_method() == viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION)
    amg_coarse_ag_stage1_mis2(A, amg_context, tag);
  else
    throw std::runtime_error("Only MIS2 coarsening implemented. Selected coarsening not available with CUDA backend!");

  // assign consecutive IDs to the coarse points (host-side scan)
  viennacl::linalg::cuda::amg::enumerate_coarse_points(amg_context);

  //
  // Stage 2: Propagate coarse aggregate indices to neighbors:
  //
  amg_agg_propagate_coarse_indices<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                                 viennacl::cuda_arg(amg_context.coarse_id_),
                                                 viennacl::cuda_arg(amg_context.influence_jumper_),
                                                 viennacl::cuda_arg(amg_context.influence_ids_),
                                                 static_cast<unsigned int>(A.size1())
                                                );
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_propagate_coarse_indices");


  //
  // Stage 3: Merge remaining undecided points (merging to first aggregate found when cycling over the hierarchy)
  //
  amg_agg_merge_undecided<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                        viennacl::cuda_arg(amg_context.coarse_id_),
                                        viennacl::cuda_arg(amg_context.influence_jumper_),
                                        viennacl::cuda_arg(amg_context.influence_ids_),
                                        static_cast<unsigned int>(A.size1())
                                       );
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_merge_undecided");

  //
  // Stage 4: Set undecided points to fine points (coarse ID already set in Stage 3)
  // Note: Stage 3 and Stage 4 were initially fused, but are now split in order to avoid race conditions (or a fallback to sequential execution).
  //
  amg_agg_merge_undecided_2<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                          static_cast<unsigned int>(A.size1())
                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_merge_undecided_2");
}




/** @brief Calls the right coarsening procedure
*
* @param A           Operator matrix on all levels
* @param amg_context AMG hierarchy datastructures
* @param tag         AMG preconditioner tag
*/
template<typename InternalT1>
void amg_coarse(InternalT1 & A,
                viennacl::linalg::detail::amg::amg_level_context & amg_context,
                viennacl::linalg::amg_tag & tag)
{
  switch (tag.get_coarsening_method())
  {
  case viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION: amg_coarse_ag(A, amg_context, tag); break;
  default: throw std::runtime_error("not implemented yet");
  }
}




////////////////////////////////////// Interpolation /////////////////////////////

/** @brief CUDA kernel filling the CSR arrays of the tentative prolongation: one unit entry per row, in the column of the row's aggregate. */
template<typename NumericT>
__global__ void amg_interpol_ag_kernel(unsigned int *P_row_buffer,
                                       unsigned int *P_col_buffer,
                                       NumericT *P_elements,
                                       unsigned int *coarse_ids,
                                       unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  // exactly one nonzero per row, so the row pointer is simply the identity
  for (unsigned int i = global_id; i < size; i += global_size)
  {
    P_row_buffer[i] = i;
    P_col_buffer[i] = coarse_ids[i];
    P_elements[i]   = NumericT(1);
  }

  // set last entry as well:
  if (global_id == 0)
    P_row_buffer[size] = size;
}

/** @brief AG (aggregation based) interpolation. Multi-Threaded!
(VIENNACL_INTERPOL_AG)
 *
 * @param A            Operator matrix
 * @param P            Prolongation matrix
 * @param amg_context  AMG hierarchy datastructures
 * @param tag          AMG configuration tag
*/
template<typename NumericT>
void amg_interpol_ag(compressed_matrix<NumericT> const & A,
                     compressed_matrix<NumericT> & P,
                     viennacl::linalg::detail::amg::amg_level_context & amg_context,
                     viennacl::linalg::amg_tag & tag)
{
  (void)tag; // no configuration needed for plain aggregation interpolation

  // P has one unit entry per fine row, hence nnz == A.size1()
  P = compressed_matrix<NumericT>(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));

  amg_interpol_ag_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(P.handle1().cuda_handle()),
                                       viennacl::cuda_arg<unsigned int>(P.handle2().cuda_handle()),
                                       viennacl::cuda_arg<NumericT>(P.handle().cuda_handle()),
                                       viennacl::cuda_arg(amg_context.coarse_id_),
                                       static_cast<unsigned int>(A.size1())
                                      );
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_interpol_ag_kernel");

  P.generate_row_block_information();
}



/** @brief CUDA kernel assembling the damped Jacobi smoother matrix (I - omega * D^{-1} A) with the same sparsity pattern as A.
*
* NOTE(review): a row without a diagonal entry keeps diag == 0, so off-diagonal
* entries would be divided by zero — confirm upstream that A always carries a diagonal.
*/
template<typename NumericT>
__global__ void amg_interpol_sa_kernel(
          const unsigned int *A_row_indices,
          const unsigned int *A_col_indices,
          const NumericT *A_elements,
          unsigned int A_size1,
          unsigned int A_nnz,
          unsigned int *Jacobi_row_indices,
          unsigned int *Jacobi_col_indices,
          NumericT *Jacobi_elements,
          NumericT omega
          )
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int row = global_id; row < A_size1; row += global_size)
  {
    unsigned int row_begin = A_row_indices[row];
    unsigned int row_end   = A_row_indices[row+1];

    // Jacobi shares the sparsity pattern of A, so the row pointer carries over
    Jacobi_row_indices[row] = row_begin;

    // Step 1: Extract diagonal:
    NumericT diag = 0;
    for (unsigned int j = row_begin; j < row_end; ++j)
    {
      if (A_col_indices[j] == row)
      {
        diag = A_elements[j];
        break;
      }
    }

    // Step 2: Write entries: 1 - omega on the diagonal, -omega * a_ij / a_ii elsewhere
    for (unsigned int j = row_begin; j < row_end; ++j)
    {
      unsigned int col_index = A_col_indices[j];
      Jacobi_col_indices[j] = col_index;

      Jacobi_elements[j] = (col_index == row) ? NumericT(1) - omega
                                              : - omega * A_elements[j] / diag;
    }
  }

  if (global_id == 0)
    Jacobi_row_indices[A_size1] = A_nnz; // don't forget finalizer
}



/** @brief Smoothed aggregation interpolation. (VIENNACL_INTERPOL_SA)
 *
 * Builds P = (I - omega * D^{-1} A) * P_tentative, i.e. one damped-Jacobi smoothing
 * sweep applied to the tentative aggregation-based prolongation.
 *
 * @param A            Operator matrix
 * @param P            Prolongation matrix
 * @param amg_context  AMG hierarchy datastructures
 * @param tag          AMG configuration tag (supplies the Jacobi weight omega)
*/
template<typename NumericT>
void amg_interpol_sa(compressed_matrix<NumericT> const & A,
                     compressed_matrix<NumericT> & P,
                     viennacl::linalg::detail::amg::amg_level_context & amg_context,
                     viennacl::linalg::amg_tag & tag)
{
  (void)tag;
  viennacl::compressed_matrix<NumericT> P_tentative(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));

  // form tentative operator:
  amg_interpol_ag(A, P_tentative, amg_context, tag);

  viennacl::compressed_matrix<NumericT> Jacobi(A.size1(), A.size1(), A.nnz(), viennacl::traits::context(A));

  amg_interpol_sa_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                       viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                       viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                       static_cast<unsigned int>(A.size1()),
                                       static_cast<unsigned int>(A.nnz()),
                                       viennacl::cuda_arg<unsigned int>(Jacobi.handle1().cuda_handle()),
                                       viennacl::cuda_arg<unsigned int>(Jacobi.handle2().cuda_handle()),
                                       viennacl::cuda_arg<NumericT>(Jacobi.handle().cuda_handle()),
                                       NumericT(tag.get_jacobi_weight())
                                      );
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_interpol_sa_kernel");

  P = viennacl::linalg::prod(Jacobi, P_tentative);

  P.generate_row_block_information();
}


/** @brief Dispatcher for building the interpolation matrix
 *
 * @param A            Operator matrix
 * @param P            Prolongation matrix
 * @param amg_context  AMG hierarchy datastructures
 * @param tag          AMG configuration tag
*/
template<typename MatrixT>
void
amg_interpol(MatrixT const & A,
             MatrixT & P,
             viennacl::linalg::detail::amg::amg_level_context & amg_context,
             viennacl::linalg::amg_tag & tag)
{
  switch (tag.get_interpolation_method())
  {
  case viennacl::linalg::AMG_INTERPOLATION_METHOD_AGGREGATION:          amg_interpol_ag(A, P, amg_context, tag); break;
  case viennacl::linalg::AMG_INTERPOLATION_METHOD_SMOOTHED_AGGREGATION: amg_interpol_sa(A, P, amg_context, tag); break;
  default: throw std::runtime_error("Not implemented yet!");
  }
}


/** @brief CUDA kernel scattering the entries of a CSR matrix into a (strided) dense matrix B.
*
* B is addressed through start/inc/size/internal-size parameters in row-major layout;
* entries of B not covered by the sparsity pattern are left untouched.
*/
template<typename NumericT>
__global__ void compressed_matrix_assign_to_dense(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT *B,
          unsigned int B_row_start, unsigned int B_col_start,
          unsigned int B_row_inc, unsigned int B_col_inc,
          unsigned int B_row_size, unsigned int B_col_size,
          unsigned int B_internal_rows, unsigned int B_internal_cols)
{
  // grid-stride loop over the rows of the sparse matrix
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < B_row_size;
                    row += gridDim.x * blockDim.x)
  {
    unsigned int row_end = row_indices[row+1];
    for (unsigned int j = row_indices[row]; j<row_end; j++)
      B[(B_row_start + row * B_row_inc) * B_internal_cols + B_col_start + column_indices[j] * B_col_inc] = elements[j];
  }
}


/** @brief Writes a compressed_matrix into a dense matrix_base (sparsity-pattern entries only). */
template<typename NumericT, unsigned int AlignmentV>
void assign_to_dense(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
                     viennacl::matrix_base<NumericT> & B)
{
  compressed_matrix_assign_to_dense<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                  viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                  viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                  viennacl::cuda_arg<NumericT>(B),
                                                  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
                                                  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
                                                  static_cast<unsigned
int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
                                                  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B))
                                                 );
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_assign_to_dense");
}




/** @brief CUDA kernel performing one damped Jacobi sweep: x_new = weight * D^{-1} (rhs - (A - D) x_old) + (1 - weight) * x_old.
*
* NOTE(review): a row without a diagonal entry falls back to diag == 1 — confirm A always stores its diagonal.
*/
template<typename NumericT>
__global__ void compressed_matrix_smooth_jacobi_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT weight,
          const NumericT * x_old,
          NumericT * x_new,
          const NumericT * rhs,
          unsigned int size)
{
  // grid-stride loop over matrix rows
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size;
                    row += gridDim.x * blockDim.x)
  {
    NumericT off_diag_sum = NumericT(0);
    NumericT diag_entry   = NumericT(1);

    unsigned int row_end = row_indices[row+1];
    for (unsigned int j = row_indices[row]; j < row_end; ++j)
    {
      unsigned int col = column_indices[j];
      if (col == row)
        diag_entry = elements[j];
      else
        off_diag_sum += elements[j] * x_old[col];
    }

    x_new[row] = weight * (rhs[row] - off_diag_sum) / diag_entry
               + (NumericT(1) - weight) * x_old[row];
  }
}




/** @brief Damped Jacobi Smoother (CUDA version)
*
* @param iterations  Number of smoother iterations
* @param A           Operator matrix for the smoothing
* @param x           The vector smoothing is applied to
* @param x_backup    (Different) Vector holding the same values as x
* @param rhs_smooth  The right hand side of the equation for the smoother
* @param weight      Damping factor. 0: No effect of smoother. 1: Undamped Jacobi iteration
*/
template<typename NumericT>
void smooth_jacobi(unsigned int iterations,
                   compressed_matrix<NumericT> const & A,
                   vector<NumericT> & x,
                   vector<NumericT> & x_backup,
                   vector<NumericT> const & rhs_smooth,
                   NumericT weight)
{
  for (unsigned int i=0; i<iterations; ++i)
  {
    // snapshot the current iterate; the kernel reads x_backup and writes x
    x_backup = x;

    compressed_matrix_smooth_jacobi_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                         viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                         viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                         static_cast<NumericT>(weight),
                                                         viennacl::cuda_arg(x_backup),
                                                         viennacl::cuda_arg(x),
                                                         viennacl::cuda_arg(rhs_smooth),
                                                         static_cast<unsigned int>(rhs_smooth.size())
                                                        );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_smooth_jacobi_kernel");
  }
}


} //namespace amg
} //namespace cuda
} //namespace linalg
} //namespace viennacl

#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp new file mode 100644 index 0000000..39f0015 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp @@ -0,0 +1,166 @@ +#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_CALLS_HPP_ +#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_CALLS_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + + +/** @file viennacl/linalg/cuda/bisect_kernel_calls.hpp + @brief CUDA kernel calls for the bisection algorithm + + Implementation based on the sample provided with the CUDA 6.0 SDK, for which + the creation of derivative works is allowed by including the following statement: + "This software contains source code provided by NVIDIA Corporation." 
*/

#include "viennacl/linalg/detail/bisect/structs.hpp"

// includes, kernels
#include "viennacl/linalg/cuda/bisect_kernel_small.hpp"
#include "viennacl/linalg/cuda/bisect_kernel_large.hpp"
#include "viennacl/linalg/cuda/bisect_kernel_large_onei.hpp"
#include "viennacl/linalg/cuda/bisect_kernel_large_multi.hpp"


namespace viennacl
{
namespace linalg
{
namespace cuda
{
/** @brief Launches the one-block bisection kernel for small tridiagonal matrices.
*
* @param input     Tridiagonal matrix data (g_a: diagonal, g_b: subdiagonal)
* @param result    Receives the computed interval bounds and counts
* @param mat_size  Matrix dimension
* @param lg, ug    Lower/upper bound of the initial eigenvalue search interval
* @param precision Desired interval width at which bisection stops
*/
template<typename NumericT>
void bisectSmall(const viennacl::linalg::detail::InputData<NumericT> &input, viennacl::linalg::detail::ResultDataSmall<NumericT> &result,
                 const unsigned int mat_size,
                 const NumericT lg, const NumericT ug,
                 const NumericT precision)
{

  // single block; the kernel handles the whole (small) matrix
  dim3 blocks(1, 1, 1);
  dim3 threads(VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX, 1, 1);

  // g_b is shifted by one: the kernel expects the subdiagonal starting at index 1
  bisectKernelSmall<<< blocks, threads >>>(
    viennacl::cuda_arg(input.g_a),
    viennacl::cuda_arg(input.g_b) + 1,
    mat_size,
    viennacl::cuda_arg(result.vcl_g_left),
    viennacl::cuda_arg(result.vcl_g_right),
    viennacl::cuda_arg(result.vcl_g_left_count),
    viennacl::cuda_arg(result.vcl_g_right_count),
    lg, ug, 0, mat_size,
    precision
    );
  viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("Kernel launch failed");
}


/** @brief Launches the first-pass bisection kernel for large matrices, splitting the spectrum into one- and multi-eigenvalue intervals. */
template<typename NumericT>
void bisectLarge(const viennacl::linalg::detail::InputData<NumericT> &input, viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
                 const unsigned int mat_size,
                 const NumericT lg, const NumericT ug,
                 const NumericT precision)
 {

  dim3 blocks(1, 1, 1);
  dim3 threads(mat_size > 512 ?
VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2 , 1, 1);
  // note: g_b is shifted by one — the kernel expects the subdiagonal starting at index 1
  bisectKernelLarge<<< blocks, threads >>>
    (viennacl::cuda_arg(input.g_a),
     viennacl::cuda_arg(input.g_b) + 1,
     mat_size,
     lg, ug, static_cast<unsigned int>(0), mat_size, precision,
     viennacl::cuda_arg(result.g_num_one),
     viennacl::cuda_arg(result.g_num_blocks_mult),
     viennacl::cuda_arg(result.g_left_one),
     viennacl::cuda_arg(result.g_right_one),
     viennacl::cuda_arg(result.g_pos_one),
     viennacl::cuda_arg(result.g_left_mult),
     viennacl::cuda_arg(result.g_right_mult),
     viennacl::cuda_arg(result.g_left_count_mult),
     viennacl::cuda_arg(result.g_right_count_mult),
     viennacl::cuda_arg(result.g_blocks_mult),
     viennacl::cuda_arg(result.g_blocks_mult_sum)
    );
  viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("Kernel launch failed.");
}


// compute eigenvalues for intervals that contained only one eigenvalue
// after the first processing step
template<typename NumericT>
void bisectLarge_OneIntervals(const viennacl::linalg::detail::InputData<NumericT> &input, viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
                              const unsigned int mat_size,
                              const NumericT precision)
 {

  // one thread per single-eigenvalue interval; grid sized to cover them all
  unsigned int num_one_intervals = result.g_num_one;
  unsigned int num_blocks = viennacl::linalg::detail::getNumBlocksLinear(num_one_intervals,
                                                                        mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2);
  dim3 grid_onei;
  grid_onei.x = num_blocks;
  grid_onei.y = 1, grid_onei.z = 1;
  // block size mirrors the first-pass kernel (half size for small matrices)
  dim3 threads_onei(mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2, 1, 1);


  // note: g_b is shifted by one — the kernel expects the subdiagonal starting at index 1
  bisectKernelLarge_OneIntervals<<< grid_onei , threads_onei >>>
    (viennacl::cuda_arg(input.g_a),
     viennacl::cuda_arg(input.g_b) + 1,
     mat_size, num_one_intervals,
     viennacl::cuda_arg(result.g_left_one),
     viennacl::cuda_arg(result.g_right_one),
     viennacl::cuda_arg(result.g_pos_one),
     precision
    );
  viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("bisectKernelLarge_OneIntervals() FAILED.");
}


// process intervals that contained more than one eigenvalue after
// the first processing step
template<typename NumericT>
void bisectLarge_MultIntervals(const viennacl::linalg::detail::InputData<NumericT> &input, viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
                               const unsigned int mat_size,
                               const NumericT precision)
 {
  // get the number of blocks of intervals that contain, in total when
  // each interval contains only one eigenvalue, not more than
  // MAX_THREADS_BLOCK threads
  unsigned int num_blocks_mult = result.g_num_blocks_mult;

  // setup the execution environment
  dim3 grid_mult(num_blocks_mult, 1, 1);
  dim3 threads_mult(mat_size > 512 ?
VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2, 1, 1); + + bisectKernelLarge_MultIntervals<<< grid_mult, threads_mult >>> + (viennacl::cuda_arg(input.g_a), + viennacl::cuda_arg(input.g_b) + 1, + mat_size, + viennacl::cuda_arg(result.g_blocks_mult), + viennacl::cuda_arg(result.g_blocks_mult_sum), + viennacl::cuda_arg(result.g_left_mult), + viennacl::cuda_arg(result.g_right_mult), + viennacl::cuda_arg(result.g_left_count_mult), + viennacl::cuda_arg(result.g_right_count_mult), + viennacl::cuda_arg(result.g_lambda_mult), + viennacl::cuda_arg(result.g_pos_mult), + precision + ); + viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("bisectKernelLarge_MultIntervals() FAILED."); +} +} +} +} + +#endif http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp new file mode 100755 index 0000000..77c9773 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp @@ -0,0 +1,928 @@ +#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_ +#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. 
+ + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + + +/** @file viennacl/linalg/cuda/bisect_kernel_large.hpp + @brief First step of the bisection algorithm for the computation of eigenvalues. + + Implementation based on the sample provided with the CUDA 6.0 SDK, for which + the creation of derivative works is allowed by including the following statement: + "This software contains source code provided by NVIDIA Corporation." +*/ + + +/* Determine eigenvalues for large symmetric, tridiagonal matrix. First + step of the computation. */ + +// includes, project +#include "viennacl/linalg/detail/bisect/config.hpp" +#include "viennacl/linalg/detail/bisect/util.hpp" +// additional kernel +#include "viennacl/linalg/cuda/bisect_util.hpp" + +// declaration, forward + +namespace viennacl +{ +namespace linalg +{ +namespace cuda +{ +//////////////////////////////////////////////////////////////////////////////// +//! 
//! Write data to global memory
////////////////////////////////////////////////////////////////////////////////
// Copies the final interval lists from shared memory to global memory.
// Each thread writes up to two entries, at indices tid and tid_2
// (tid_2 = tid + blockDim.x in the caller).  Entries below
// offset_mult_lambda hold intervals with exactly one eigenvalue and go to
// the g_*_one arrays; the rest hold multiple eigenvalues and go to the
// g_*_mult arrays.  Finally the compacted block list and the per-block
// eigenvalue sums are written out.
// NOTE(review): the first write for tid is not guarded by
// tid < num_threads_active (the tid_2 write is) — presumably the caller
// guarantees tid is always a valid interval index; confirm against the
// originating CUDA SDK sample.
template<typename NumericT>
__device__
void writeToGmem(const unsigned int tid, const unsigned int tid_2,
                 const unsigned int num_threads_active,
                 const unsigned int num_blocks_mult,
                 NumericT *g_left_one, NumericT *g_right_one,
                 unsigned int *g_pos_one,
                 NumericT *g_left_mult, NumericT *g_right_mult,
                 unsigned int *g_left_count_mult,
                 unsigned int *g_right_count_mult,
                 NumericT *s_left, NumericT *s_right,
                 unsigned short *s_left_count, unsigned short *s_right_count,
                 unsigned int *g_blocks_mult,
                 unsigned int *g_blocks_mult_sum,
                 unsigned short *s_compaction_list,
                 unsigned short *s_cl_helper,
                 unsigned int offset_mult_lambda
                )
{
  if (tid < offset_mult_lambda)
  {
    // interval contains a single eigenvalue
    g_left_one[tid] = s_left[tid];
    g_right_one[tid] = s_right[tid];
    // right count can be used to order eigenvalues without sorting
    g_pos_one[tid] = s_right_count[tid];
  }
  else
  {
    // interval contains multiple eigenvalues
    g_left_mult[tid - offset_mult_lambda] = s_left[tid];
    g_right_mult[tid - offset_mult_lambda] = s_right[tid];
    g_left_count_mult[tid - offset_mult_lambda] = s_left_count[tid];
    g_right_count_mult[tid - offset_mult_lambda] = s_right_count[tid];
  }

  // second entry handled by this thread
  if (tid_2 < num_threads_active)
  {
    if (tid_2 < offset_mult_lambda)
    {
      g_left_one[tid_2] = s_left[tid_2];
      g_right_one[tid_2] = s_right[tid_2];
      // right count can be used to order eigenvalues without sorting
      g_pos_one[tid_2] = s_right_count[tid_2];
    }
    else
    {
      g_left_mult[tid_2 - offset_mult_lambda] = s_left[tid_2];
      g_right_mult[tid_2 - offset_mult_lambda] = s_right[tid_2];
      g_left_count_mult[tid_2 - offset_mult_lambda] = s_left_count[tid_2];
      g_right_count_mult[tid_2 - offset_mult_lambda] = s_right_count[tid_2];
    }
  } // end writing out data

  // make sure all interval data are in global memory before the block
  // lists (which alias the same shared arrays upstream) are written out
  __syncthreads();

  // note that s_cl_blocking = s_compaction_list + 1;, that is by writing out
  // s_compaction_list we write the exclusive scan result
  if (tid <= num_blocks_mult)
  {
    g_blocks_mult[tid] = s_compaction_list[tid];
    g_blocks_mult_sum[tid] = s_cl_helper[tid];
  }

  if (tid_2 <= num_blocks_mult)
  {
    g_blocks_mult[tid_2] = s_compaction_list[tid_2];
    g_blocks_mult_sum[tid_2] = s_cl_helper[tid_2];
  }
}

////////////////////////////////////////////////////////////////////////////////
//! Perform final stream compaction before writing data to global memory
////////////////////////////////////////////////////////////////////////////////
// Reorders the intervals in shared memory so that all one-eigenvalue
// intervals come first (indices [0, offset_mult_lambda)) followed by all
// multi-eigenvalue intervals.  Each thread caches its two entries (tid,
// tid_2) in registers, computes its destination address from the scan
// results s_cl_one / s_cl_mult, and stores back compactly.  Also records
// the block boundaries (c_block_iend*) and per-block sums (c_sum_block*)
// into s_cl_blocking / s_cl_helper.
template<typename NumericT>
__device__
void
compactStreamsFinal(const unsigned int tid, const unsigned int tid_2,
                    const unsigned int num_threads_active,
                    unsigned int &offset_mult_lambda,
                    NumericT *s_left, NumericT *s_right,
                    unsigned short *s_left_count, unsigned short *s_right_count,
                    unsigned short *s_cl_one, unsigned short *s_cl_mult,
                    unsigned short *s_cl_blocking, unsigned short *s_cl_helper,
                    unsigned int is_one_lambda, unsigned int is_one_lambda_2,
                    NumericT &left, NumericT &right, NumericT &left_2, NumericT &right_2,
                    unsigned int &left_count, unsigned int &right_count,
                    unsigned int &left_count_2, unsigned int &right_count_2,
                    unsigned int c_block_iend, unsigned int c_sum_block,
                    unsigned int c_block_iend_2, unsigned int c_sum_block_2
                   )
{
  // cache data before performing compaction
  left = s_left[tid];
  right = s_right[tid];

  if (tid_2 < num_threads_active)
  {
    left_2 = s_left[tid_2];
    right_2 = s_right[tid_2];
  }

  // all reads must complete before the in-place stores below
  __syncthreads();

  // determine addresses for intervals containing multiple eigenvalues and
  // addresses for blocks of intervals
  unsigned int ptr_w = 0;
  unsigned int ptr_w_2 = 0;
  unsigned int ptr_blocking_w = 0;
  unsigned int ptr_blocking_w_2 = 0;

  // destination: one-eigenvalue intervals go to the front (s_cl_one scan),
  // multi-eigenvalue intervals behind them (s_cl_mult scan + offset)
  ptr_w = (1 == is_one_lambda) ? s_cl_one[tid]
                               : s_cl_mult[tid] + offset_mult_lambda;

  if (0 != c_block_iend)
  {
    ptr_blocking_w = s_cl_blocking[tid];
  }

  if (tid_2 < num_threads_active)
  {
    ptr_w_2 = (1 == is_one_lambda_2) ? s_cl_one[tid_2]
                                     : s_cl_mult[tid_2] + offset_mult_lambda;

    if (0 != c_block_iend_2)
    {
      ptr_blocking_w_2 = s_cl_blocking[tid_2];
    }
  }


  __syncthreads();
  if(tid < num_threads_active)
  {
    // store compactly in shared mem
    s_left[ptr_w] = left;
    s_right[ptr_w] = right;
    s_left_count[ptr_w] = left_count;
    s_right_count[ptr_w] = right_count;
  }


  __syncthreads();
  // NOTE(review): this re-stores thread 1's entry only, duplicating the
  // store above — looks like a workaround rather than intended logic;
  // confirm against the originating CUDA SDK bisection sample.
  if(tid == 1)
  {
    s_left[ptr_w] = left;
    s_right[ptr_w] = right;
    s_left_count[ptr_w] = left_count;
    s_right_count[ptr_w] = right_count;
  }
  if (0 != c_block_iend)
  {
    // +1: entry 0 of the block list stays reserved (exclusive scan layout)
    s_cl_blocking[ptr_blocking_w + 1] = c_block_iend - 1;
    s_cl_helper[ptr_blocking_w + 1] = c_sum_block;
  }

  if (tid_2 < num_threads_active)
  {

    // store compactly in shared mem
    s_left[ptr_w_2] = left_2;
    s_right[ptr_w_2] = right_2;
    s_left_count[ptr_w_2] = left_count_2;
    s_right_count[ptr_w_2] = right_count_2;

    if (0 != c_block_iend_2)
    {
      s_cl_blocking[ptr_blocking_w_2 + 1] = c_block_iend_2 - 1;
      s_cl_helper[ptr_blocking_w_2 + 1] = c_sum_block_2;
    }
  }

}

////////////////////////////////////////////////////////////////////////////////
//!
Compute addresses to obtain compact list of block start addresses +//////////////////////////////////////////////////////////////////////////////// +inline __device__ +void +scanCompactBlocksStartAddress(const unsigned int tid, const unsigned int tid_2, + const unsigned int num_threads_compaction, + unsigned short *s_cl_blocking, + unsigned short *s_cl_helper + ) +{ + // prepare for second step of block generation: compaction of the block + // list itself to efficiently write out these + s_cl_blocking[tid] = s_cl_helper[tid]; + + if (tid_2 < num_threads_compaction) + { + s_cl_blocking[tid_2] = s_cl_helper[tid_2]; + } + + __syncthreads(); + + // additional scan to compact s_cl_blocking that permits to generate a + // compact list of eigenvalue blocks each one containing about + // VIENNACL_BISECT_MAX_THREADS_BLOCK eigenvalues (so that each of these blocks may be + // processed by one thread block in a subsequent processing step + + unsigned int offset = 1; + + // build scan tree + for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) + { + + __syncthreads(); + + if (tid < d) + { + + unsigned int ai = offset*(2*tid+1)-1; + unsigned int bi = offset*(2*tid+2)-1; + s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai]; + } + + offset <<= 1; + } + + // traverse down tree: first down to level 2 across + for (int d = 2; d < num_threads_compaction; d <<= 1) + { + + offset >>= 1; + __syncthreads(); + + // + if (tid < (d-1)) + { + + unsigned int ai = offset*(tid+1) - 1; + unsigned int bi = ai + (offset >> 1); + s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai]; + } + } + +} + +//////////////////////////////////////////////////////////////////////////////// +//! 
Perform scan to obtain number of eigenvalues before a specific block +//////////////////////////////////////////////////////////////////////////////// +inline __device__ +void +scanSumBlocks(const unsigned int tid, const unsigned int tid_2, + const unsigned int num_threads_active, + const unsigned int num_threads_compaction, + unsigned short *s_cl_blocking, + unsigned short *s_cl_helper) +{ + unsigned int offset = 1; + + // first step of scan to build the sum of elements within each block + // build up tree + for (int d = num_threads_compaction >> 1; d > 0; d >>= 1) + { + + __syncthreads(); + + if (tid < d) + { + + unsigned int ai = offset*(2*tid+1)-1; + unsigned int bi = offset*(2*tid+2)-1; + + s_cl_blocking[bi] += s_cl_blocking[ai]; + } + + offset *= 2; + } + + // first step of scan to build the sum of elements within each block + // traverse down tree + for (int d = 2; d < (num_threads_compaction - 1); d <<= 1) + { + + offset >>= 1; + __syncthreads(); + + if (tid < (d-1)) + { + + unsigned int ai = offset*(tid+1) - 1; + unsigned int bi = ai + (offset >> 1); + + s_cl_blocking[bi] += s_cl_blocking[ai]; + } + } + + __syncthreads(); + + if (0 == tid) + { + + // move last element of scan to last element that is valid + // necessary because the number of threads employed for scan is a power + // of two and not necessarily the number of active threasd + s_cl_helper[num_threads_active - 1] = + s_cl_helper[num_threads_compaction - 1]; + s_cl_blocking[num_threads_active - 1] = + s_cl_blocking[num_threads_compaction - 1]; + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Perform initial scan for compaction of intervals containing one and +//! 
//! multiple eigenvalues; also do initial scan to build blocks
////////////////////////////////////////////////////////////////////////////////
// Simultaneous up-sweep over three lists sharing one index space:
//   s_cl_one  — count of intervals with exactly one eigenvalue (scan tree)
//   s_cl_mult — count of intervals with multiple eigenvalues (scan tree)
//   s_cl_blocking / s_cl_helper — greedy bottom-up merging of adjacent
//   multi-eigenvalue intervals into blocks of at most
//   VIENNACL_BISECT_MAX_THREADS_BLOCK (or half that for mat_size <= 512)
//   eigenvalues; s_cl_helper == 1 marks a finished (root) block.
// The down-sweep afterwards completes the scans of s_cl_one / s_cl_mult
// only — block construction needs no down-sweep.
inline __device__
void
scanInitial(const unsigned int tid, const unsigned int tid_2, const unsigned int mat_size,
            const unsigned int num_threads_active,
            const unsigned int num_threads_compaction,
            unsigned short *s_cl_one, unsigned short *s_cl_mult,
            unsigned short *s_cl_blocking, unsigned short *s_cl_helper
           )
{

  // perform scan to compactly write out the intervals containing one and
  // multiple eigenvalues
  // also generate tree for blocking of intervals containing multiple
  // eigenvalues

  unsigned int offset = 1;

  // build scan tree
  for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
  {

    __syncthreads();

    if (tid < d)
    {

      unsigned int ai = offset*(2*tid+1);
      unsigned int bi = offset*(2*tid+2)-1;

      s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai - 1];
      s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai - 1];

      // s_cl_helper is binary and zero for an internal node and 1 for a
      // root node of a tree corresponding to a block
      // s_cl_blocking contains the number of nodes in each sub-tree at each
      // iteration, the data has to be kept to compute the total number of
      // eigenvalues per block that, in turn, is needed to efficiently
      // write out data in the second step
      if ((s_cl_helper[ai - 1] != 1) || (s_cl_helper[bi] != 1))
      {

        // check how many childs are non terminated
        if (s_cl_helper[ai - 1] == 1)
        {
          // mark as terminated
          s_cl_helper[bi] = 1;
        }
        else if (s_cl_helper[bi] == 1)
        {
          // mark as terminated
          s_cl_helper[ai - 1] = 1;
        }
        else // both childs are non-terminated
        {

          unsigned int temp = s_cl_blocking[bi] + s_cl_blocking[ai - 1];

          // merged block would exceed the per-thread-block eigenvalue budget?
          if (temp > (mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2))
          {

            // the two child trees have to form separate blocks, terminate trees
            s_cl_helper[ai - 1] = 1;
            s_cl_helper[bi] = 1;
          }
          else
          {
            // build up tree by joining subtrees
            s_cl_blocking[bi] = temp;
            s_cl_blocking[ai - 1] = 0;
          }
        }
      } // end s_cl_helper update

    }

    offset <<= 1;
  }


  // traverse down tree, this only for stream compaction, not for block
  // construction
  for (int d = 2; d < num_threads_compaction; d <<= 1)
  {

    offset >>= 1;
    __syncthreads();

    //
    if (tid < (d-1))
    {

      unsigned int ai = offset*(tid+1) - 1;
      unsigned int bi = ai + (offset >> 1);

      s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai];
      s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai];
    }
  }

}

////////////////////////////////////////////////////////////////////////////////
//! Store all non-empty intervals resulting from the subdivision of the interval
//! currently processed by the thread
////////////////////////////////////////////////////////////////////////////////
// Splits [left, right) at mid; stores a child interval only if it contains
// eigenvalues (i.e. its boundary counts differ).  When both children are
// non-empty, the first child is stored at addr and the second is flagged in
// s_compaction_list for the subsequent compaction pass (is_active_second,
// compact_second_chunk).  storeInterval() also snaps converged intervals.
template<typename NumericT>
__device__
void
storeNonEmptyIntervalsLarge(unsigned int addr,
                            const unsigned int num_threads_active,
                            NumericT *s_left, NumericT *s_right,
                            unsigned short *s_left_count,
                            unsigned short *s_right_count,
                            NumericT left, NumericT mid, NumericT right,
                            const unsigned short left_count,
                            const unsigned short mid_count,
                            const unsigned short right_count,
                            NumericT epsilon,
                            unsigned int &compact_second_chunk,
                            unsigned short *s_compaction_list,
                            unsigned int &is_active_second)
{
  // check if both child intervals are valid
  if ((left_count != mid_count) && (mid_count != right_count))
  {

    storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
                  left, mid, left_count, mid_count, epsilon);

    is_active_second = 1;
    s_compaction_list[threadIdx.x] = 1;
    compact_second_chunk = 1;
  }
  else
  {

    // only one non-empty child interval

    // mark that no second child
    is_active_second = 0;
    s_compaction_list[threadIdx.x] = 0;

    // store the one valid child interval
    if (left_count != mid_count)
    {
      storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
                    left, mid, left_count, mid_count, epsilon);
    }
    else
    {
      storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
                    mid, right, mid_count, right_count, epsilon);
    }
  }
}

/** @brief Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix
*
* First step for large matrices: subdivides the Gerschgorin interval until
* either all intervals converged or the interval count exceeds the block
* size, then separates the intervals into a list containing exactly one
* eigenvalue each and a list of blocks of multi-eigenvalue intervals for
* the second-step kernels.  Runs as a single thread block (see bisectLarge()).
*
* g_d             diagonal elements in global memory
* g_s             superdiagonal elements in global elements (stored so that the element *(g_s - 1) can be accessed and equals 0
* n               size of matrix
* lg              lower bound of input interval (e.g. Gerschgorin interval)
* ug              upper bound of input interval (e.g. Gerschgorin interval)
* lg_eig_count    number of eigenvalues that are smaller than lg
* lu_eig_count    number of eigenvalues that are smaller than lu
* epsilon         desired accuracy of eigenvalues to compute
*/
template<typename NumericT>
__global__
void
bisectKernelLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
                  const NumericT lg, const NumericT ug,
                  const unsigned int lg_eig_count,
                  const unsigned int ug_eig_count,
                  NumericT epsilon,
                  unsigned int *g_num_one,
                  unsigned int *g_num_blocks_mult,
                  NumericT *g_left_one, NumericT *g_right_one,
                  unsigned int *g_pos_one,
                  NumericT *g_left_mult, NumericT *g_right_mult,
                  unsigned int *g_left_count_mult,
                  unsigned int *g_right_count_mult,
                  unsigned int *g_blocks_mult,
                  unsigned int *g_blocks_mult_sum
                 )
{
  const unsigned int tid = threadIdx.x;

  // intervals (store left and right because the subdivision tree is in general
  // not dense
  __shared__ NumericT s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
  __shared__ NumericT s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];

  // number of eigenvalues that are smaller than s_left / s_right
  // (correspondence is realized via indices)
  __shared__ unsigned short s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
  __shared__ unsigned short s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];

  // helper for stream compaction
  __shared__ unsigned short s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];

  // state variables for whole block
  // if 0 then compaction of second chunk of child intervals is not necessary
  // (because all intervals had exactly one non-dead child)
  __shared__ unsigned int compact_second_chunk;
  // if 1 then all threads are converged
  __shared__ unsigned int all_threads_converged;

  // number of currently active threads
  __shared__ unsigned int num_threads_active;

  // number of threads to use for stream compaction
  __shared__ unsigned int num_threads_compaction;

  // helper for exclusive scan
  unsigned short *s_compaction_list_exc = s_compaction_list + 1;


  // variables for currently processed interval
  // left and right limit of active interval
  NumericT left = 0.0f;
  NumericT right = 0.0f;
  unsigned int left_count = 0;
  unsigned int right_count = 0;
  // midpoint of active interval
  NumericT mid = 0.0f;
  // number of eigenvalues smaller then mid
  unsigned int mid_count = 0;
  // helper for stream compaction (tracking of threads generating second child)
  unsigned int is_active_second = 0;

  // initialize lists
  s_compaction_list[tid] = 0;
  s_left[tid] = 0;
  s_right[tid] = 0;
  s_left_count[tid] = 0;
  s_right_count[tid] = 0;

  __syncthreads();

  // set up initial configuration
  if (0 == tid)
  {

    s_left[0] = lg;
    s_right[0] = ug;
    s_left_count[0] = lg_eig_count;
    s_right_count[0] = ug_eig_count;

    compact_second_chunk = 0;
    num_threads_active = 1;

    num_threads_compaction = 1;

    all_threads_converged = 1;
  }

  __syncthreads();

  // for all active threads read intervals from the last level
  // the number of (worst case) active threads per level l is 2^l
  // determine coarse intervals. On these intervals the kernel for one or for multiple eigenvalues
  // will be executed in the second step
  while(true)
  {
    s_compaction_list[tid] = 0;
    s_compaction_list[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
    s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
    subdivideActiveInterval(tid, s_left, s_right, s_left_count, s_right_count,
                            num_threads_active,
                            left, right, left_count, right_count,
                            mid, all_threads_converged);

    __syncthreads();

    // check if done
    if (1 == all_threads_converged)
    {
      break;
    }

    // compute number of eigenvalues smaller than mid
    // use all threads for reading the necessary matrix data from global
    // memory
    // use s_left and s_right as scratch space for diagonal and
    // superdiagonal of matrix
    mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
                                                mid, threadIdx.x,
                                                num_threads_active,
                                                s_left, s_right,
                                                (left == right));

    __syncthreads();

    // store intervals
    // for all threads store the first child interval in a continuous chunk of
    // memory, and the second child interval -- if it exists -- in a second
    // chunk; it is likely that all threads reach convergence up to
    // \a epsilon at the same level; furthermore, for higher level most / all
    // threads will have only one child, storing the first child compactly will
    // (first) avoid to perform a compaction step on the first chunk, (second)
    // make it for higher levels (when all threads / intervals have
    // exactly one child) unnecessary to perform a compaction of the second
    // chunk
    if (tid < num_threads_active)
    {

      if (left != right)
      {

        // store intervals
        storeNonEmptyIntervalsLarge(tid, num_threads_active,
                                    s_left, s_right,
                                    s_left_count, s_right_count,
                                    left, mid, right,
                                    left_count, mid_count, right_count,
                                    epsilon, compact_second_chunk,
                                    s_compaction_list_exc,
                                    is_active_second);
      }
      else
      {

        // re-write converged interval (has to be stored again because s_left
        // and s_right are used as scratch space for
        // computeNumSmallerEigenvalsLarge()
        s_left[tid] = left;
        s_right[tid] = left;
        s_left_count[tid] = left_count;
        s_right_count[tid] = right_count;

        is_active_second = 0;
      }
    }

    // necessary so that compact_second_chunk is up-to-date
    __syncthreads();

    // perform compaction of chunk where second children are stored
    // scan of (num_threads_active / 2) elements, thus at most
    // (num_threads_active / 4) threads are needed
    if (compact_second_chunk > 0)
    {

      // create indices for compaction
      createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);
    }
    __syncthreads();

    if (compact_second_chunk > 0)
    {
      compactIntervals(s_left, s_right, s_left_count, s_right_count,
                       mid, right, mid_count, right_count,
                       s_compaction_list, num_threads_active,
                       is_active_second);
    }

    __syncthreads();

    // update state variables
    if (0 == tid)
    {

      // update number of active threads with result of reduction
      num_threads_active += s_compaction_list[num_threads_active];
      num_threads_compaction = ceilPow2(num_threads_active);

      compact_second_chunk = 0;
      all_threads_converged = 1;
    }

    __syncthreads();

    // stop subdividing once more intervals exist than the block can process;
    // remaining intervals are handled by the second-step kernels
    if (num_threads_compaction > blockDim.x)
    {
      break;
    }

  }

  __syncthreads();

  // generate two lists of intervals; one with intervals that contain one
  // eigenvalue (or are converged), and one with intervals that need further
  // subdivision

  // perform two scans in parallel

  unsigned int left_count_2;
  unsigned int right_count_2;

  unsigned int tid_2 = tid + blockDim.x;

  // cache in per thread registers so that s_left_count and s_right_count
  // can be used for scans
  left_count = s_left_count[tid];
  right_count = s_right_count[tid];

  // some threads have to cache data for two intervals
  if (tid_2 < num_threads_active)
  {
    left_count_2 = s_left_count[tid_2];
    right_count_2 = s_right_count[tid_2];
  }

  // compaction list for intervals containing one and multiple eigenvalues
  // do not affect first element for exclusive scan
  // (these alias the count arrays, which were cached to registers above)
  unsigned short *s_cl_one = s_left_count + 1;
  unsigned short *s_cl_mult = s_right_count + 1;

  // compaction list for generating blocks of intervals containing multiple
  // eigenvalues
  unsigned short *s_cl_blocking = s_compaction_list_exc;
  // helper compaction list for generating blocks of intervals
  __shared__ unsigned short s_cl_helper[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];

  if (0 == tid)
  {
    // set to 0 for exclusive scan
    s_left_count[0] = 0;
    s_right_count[0] = 0;

  }

  __syncthreads();

  // flag if interval contains one or multiple eigenvalues
  unsigned int is_one_lambda = 0;
  unsigned int is_one_lambda_2 = 0;

  // number of eigenvalues in the interval
  unsigned int multiplicity = right_count - left_count;
  is_one_lambda = (1 == multiplicity);

  s_cl_one[tid] = is_one_lambda;
  s_cl_mult[tid] = (! is_one_lambda);

  // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
  s_cl_blocking[tid] = (1 == is_one_lambda) ? 0 : multiplicity;
  s_cl_helper[tid] = 0;

  if (tid_2 < num_threads_active)
  {

    unsigned int multiplicity = right_count_2 - left_count_2;
    is_one_lambda_2 = (1 == multiplicity);

    s_cl_one[tid_2] = is_one_lambda_2;
    s_cl_mult[tid_2] = (! is_one_lambda_2);

    // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
    s_cl_blocking[tid_2] = (1 == is_one_lambda_2) ? 0 : multiplicity;
    s_cl_helper[tid_2] = 0;
  }
  else if (tid_2 < (2 * (n > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2) + 1))
  {

    // clear
    s_cl_blocking[tid_2] = 0;
    s_cl_helper[tid_2] = 0;
  }


  scanInitial(tid, tid_2, n, num_threads_active, num_threads_compaction,
              s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper);

  __syncthreads();

  scanSumBlocks(tid, tid_2, num_threads_active,
                num_threads_compaction, s_cl_blocking, s_cl_helper);

  // end down sweep of scan
  __syncthreads();

  unsigned int c_block_iend = 0;
  unsigned int c_block_iend_2 = 0;
  unsigned int c_sum_block = 0;
  unsigned int c_sum_block_2 = 0;

  // for each thread / interval that corresponds to root node of interval block
  // store start address of block and total number of eigenvalues in all blocks
  // before this block (particular thread is irrelevant, constraint is to
  // have a subset of threads so that one and only one of them is in each
  // interval)
  if (1 == s_cl_helper[tid])
  {

    c_block_iend = s_cl_mult[tid] + 1;
    c_sum_block = s_cl_blocking[tid];
  }

  if (1 == s_cl_helper[tid_2])
  {

    c_block_iend_2 = s_cl_mult[tid_2] + 1;
    c_sum_block_2 = s_cl_blocking[tid_2];
  }

  scanCompactBlocksStartAddress(tid, tid_2, num_threads_compaction,
                                s_cl_blocking, s_cl_helper);


  // finished second scan for s_cl_blocking
  __syncthreads();

  // determine the global results
  __shared__ unsigned int num_blocks_mult;
  __shared__ unsigned int num_mult;
  __shared__ unsigned int offset_mult_lambda;

  if (0 == tid)
  {

    num_blocks_mult = s_cl_blocking[num_threads_active - 1];
    offset_mult_lambda = s_cl_one[num_threads_active - 1];
    num_mult = s_cl_mult[num_threads_active - 1];

    *g_num_one = offset_mult_lambda;
    *g_num_blocks_mult = num_blocks_mult;
  }

  __syncthreads();

  NumericT left_2, right_2;
  // shift the list pointers back so indexing is no longer exclusive-scan shifted
  --s_cl_one;
  --s_cl_mult;
  --s_cl_blocking;

  __syncthreads();
  compactStreamsFinal(tid, tid_2, num_threads_active, offset_mult_lambda,
                      s_left, s_right, s_left_count, s_right_count,
                      s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper,
                      is_one_lambda, is_one_lambda_2,
                      left, right, left_2, right_2,
                      left_count, right_count, left_count_2, right_count_2,
                      c_block_iend, c_sum_block, c_block_iend_2, c_sum_block_2
                     );

  __syncthreads();

  // final adjustment before writing out data to global memory
  if (0 == tid)
  {
    s_cl_blocking[num_blocks_mult] = num_mult;
    s_cl_helper[0] = 0;
  }

  __syncthreads();

  // write to global memory
  writeToGmem(tid, tid_2, num_threads_active, num_blocks_mult,
              g_left_one, g_right_one, g_pos_one,
              g_left_mult, g_right_mult, g_left_count_mult, g_right_count_mult,
              s_left, s_right, s_left_count, s_right_count,
              g_blocks_mult, g_blocks_mult_sum,
              s_compaction_list, s_cl_helper, offset_mult_lambda);

}
}
}
}
#endif // #ifndef _BISECT_KERNEL_LARGE_H_
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp
new file mode 100755
index 0000000..a670256
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp
@@ -0,0 +1,277 @@
#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_
#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_

/* =========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.
+ + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + + +/** @file viennacl/linalg/cuda/bisect_kernel_large_multi.hpp + @brief Second step of the bisection algorithm for the computation of eigenvalues for large matrices. + + Implementation based on the sample provided with the CUDA 6.0 SDK, for which + the creation of derivative works is allowed by including the following statement: + "This software contains source code provided by NVIDIA Corporation." +*/ + +/* Perform second step of bisection algorithm for large matrices for + * intervals that contained after the first step more than one eigenvalue + */ + +// includes, project +#include "viennacl/linalg/detail/bisect/config.hpp" +#include "viennacl/linalg/detail/bisect/util.hpp" +// additional kernel +#include "viennacl/linalg/cuda/bisect_util.hpp" + +namespace viennacl +{ +namespace linalg +{ +namespace cuda +{ +//////////////////////////////////////////////////////////////////////////////// +//! Perform second step of bisection algorithm for large matrices for +//! intervals that after the first step contained more than one eigenvalue +//! @param g_d diagonal elements of symmetric, tridiagonal matrix +//! @param g_s superdiagonal elements of symmetric, tridiagonal matrix +//! @param n matrix size +//! @param blocks_mult start addresses of blocks of intervals that are +//! processed by one block of threads, each of the +//! intervals contains more than one eigenvalue +//! @param blocks_mult_sum total number of eigenvalues / singleton intervals +//! in one block of intervals +//! @param g_left left limits of intervals +//! @param g_right right limits of intervals +//! 
//! @param  g_left_count  number of eigenvalues less than left limits
//! @param  g_right_count number of eigenvalues less than right limits
//! @param  g_lambda  final eigenvalue
//! @param  g_pos  index of eigenvalue (in ascending order)
//! @param  precision  desired precision of eigenvalues
//!
//! Launch layout: one thread block per entry of blocks_mult (the kernel indexes
//! blocks_mult with blockIdx.x), each block refining its own group of intervals.
//! NOTE(review): shared arrays are sized 2 * VIENNACL_BISECT_MAX_THREADS_BLOCK and
//! indexed with tid + VIENNACL_BISECT_MAX_THREADS_BLOCK, so blockDim.x is presumably
//! expected to be <= VIENNACL_BISECT_MAX_THREADS_BLOCK — confirm against the launcher.
////////////////////////////////////////////////////////////////////////////////
template<typename NumericT>
__global__
void
bisectKernelLarge_MultIntervals(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
                                unsigned int *blocks_mult,
                                unsigned int *blocks_mult_sum,
                                NumericT *g_left, NumericT *g_right,
                                unsigned int *g_left_count,
                                unsigned int *g_right_count,
                                NumericT *g_lambda, unsigned int *g_pos,
                                NumericT precision
                               )
{
  const unsigned int tid = threadIdx.x;

  // left and right limits of interval
  __shared__ NumericT s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];
  __shared__ NumericT s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];

  // number of eigenvalues smaller than interval limits
  __shared__ unsigned int s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];
  __shared__ unsigned int s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];

  // helper array for chunk compaction of second chunk; the extra slot holds the
  // total count produced by the exclusive scan
  __shared__ unsigned int s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
  // compaction list helper for exclusive scan (shifted by one so slot 0 stays 0)
  unsigned int *s_compaction_list_exc = s_compaction_list + 1;

  // flag if all threads are converged (reset to 1 each iteration by thread 0;
  // subdivideActiveIntervalMulti clears it for any still-active interval)
  __shared__ unsigned int all_threads_converged;
  // number of active threads (== number of intervals currently tracked)
  __shared__ unsigned int num_threads_active;
  // number of threads to employ for compaction (num_threads_active rounded up
  // to the next power of two, as required by the scan in createIndicesCompaction)
  __shared__ unsigned int num_threads_compaction;
  // flag if second chunk has to be compacted
  __shared__ unsigned int compact_second_chunk;

  // parameters of block of intervals processed by this block of threads
  __shared__ unsigned int c_block_start;
  __shared__ unsigned int c_block_end;
  __shared__ unsigned int c_block_offset_output;

  // midpoint of currently active interval of the thread
  NumericT mid = 0.0f;
  // number of eigenvalues smaller than \a mid
  unsigned int mid_count = 0;
  // current interval parameter
  NumericT left = 0.0f;
  NumericT right = 0.0f;
  unsigned int left_count = 0;
  unsigned int right_count = 0;
  // helper for compaction, keep track which threads have a second child
  unsigned int is_active_second = 0;


  __syncthreads();
  // initialize common start conditions (thread 0 only; published to the block
  // by the barrier below)
  if (0 == tid)
  {

    c_block_start = blocks_mult[blockIdx.x];
    c_block_end = blocks_mult[blockIdx.x + 1];
    c_block_offset_output = blocks_mult_sum[blockIdx.x];


    num_threads_active = c_block_end - c_block_start;
    s_compaction_list[0] = 0;
    num_threads_compaction = ceilPow2(num_threads_active);

    all_threads_converged = 1;
    compact_second_chunk = 0;
  }

  // NOTE(review): 42 looks like a debug sentinel; the first num_threads_active
  // entries are overwritten from global memory below, and the upper half is
  // zeroed here — confirm the sentinel is intentional rather than leftover.
  s_left_count [tid] = 42;
  s_right_count[tid] = 42;
  s_left_count [tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
  s_right_count[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;

  __syncthreads();


  // read data into shared memory (one interval per active thread)
  if (tid < num_threads_active)
  {
    s_left[tid] = g_left[c_block_start + tid];
    s_right[tid] = g_right[c_block_start + tid];
    s_left_count[tid] = g_left_count[c_block_start + tid];
    s_right_count[tid] = g_right_count[c_block_start + tid];
  }

  __syncthreads();
  unsigned int iter = 0;  // iteration counter; only incremented, never read
  // do until all threads converged
  while (true)
  {
    iter++;
    //for (int iter=0; iter < 0; iter++) {
    // clear the compaction list (both halves plus the scan-total slot)
    s_compaction_list[threadIdx.x] = 0;
    s_compaction_list[threadIdx.x + blockDim.x] = 0;
    s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;

    // subdivide interval if currently active and not already converged;
    // also clears all_threads_converged when any interval remains active
    subdivideActiveIntervalMulti(tid, s_left, s_right,
                                 s_left_count, s_right_count,
                                 num_threads_active,
                                 left, right, left_count, right_count,
                                 mid, all_threads_converged);
    __syncthreads();

    // stop if all eigenvalues have been found (uniform across the block, so
    // breaking before the next barrier is safe)
    if (1 == all_threads_converged)
    {


      break;
    }

    // compute number of eigenvalues smaller than mid for active and not
    // converged intervals, use all threads for loading data from gmem and
    // s_left and s_right as scratch space to store the data load from gmem
    // in shared memory
    mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
                                                mid, tid, num_threads_active,
                                                s_left, s_right,
                                                (left == right));

    __syncthreads();

    if (tid < num_threads_active)
    {

      // store intervals: a non-degenerate interval may split into two children
      // (second child marked via s_compaction_list_exc / is_active_second)
      if (left != right)
      {

        storeNonEmptyIntervals(tid, num_threads_active,
                               s_left, s_right, s_left_count, s_right_count,
                               left, mid, right,
                               left_count, mid_count, right_count,
                               precision, compact_second_chunk,
                               s_compaction_list_exc,
                               is_active_second);

      }
      else
      {

        // degenerate (converged) interval: store it as-is
        storeIntervalConverged(s_left, s_right, s_left_count, s_right_count,
                               left, mid, right,
                               left_count, mid_count, right_count,
                               s_compaction_list_exc, compact_second_chunk,
                               num_threads_active,
                               is_active_second);

      }
    }

    __syncthreads();

    // compact second chunk of intervals if any of the threads generated
    // two child intervals
    if (1 == compact_second_chunk)
    {

      createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);
      compactIntervals(s_left, s_right, s_left_count, s_right_count,
                       mid, right, mid_count, right_count,
                       s_compaction_list, num_threads_active,
                       is_active_second);
    }

    __syncthreads();

    // update state variables: grow the active set by the number of second
    // children produced this iteration, and reset the per-iteration flags
    if (0 == tid)
    {
      num_threads_active += s_compaction_list[num_threads_active];
      num_threads_compaction = ceilPow2(num_threads_active);

      compact_second_chunk = 0;
      all_threads_converged = 1;
    }

    __syncthreads();

    // clear the exclusive-scan workspace for the next iteration
    s_compaction_list_exc[threadIdx.x] = 0;
    s_compaction_list_exc[threadIdx.x + blockDim.x] = 0;

    // capacity guard: more intervals than the block can process — bail out.
    // num_threads_compaction is shared, so all threads take this branch
    // together and no thread is stranded at the barrier below.
    // NOTE(review): results written after an early exit here may not all be
    // converged — confirm the host side accounts for this case.
    if (num_threads_compaction > blockDim.x)
    {
      break;
    }


    __syncthreads();

  } // end until all threads converged

  // write data back to global memory: for a converged interval the left limit
  // is the eigenvalue and the right count gives its (ascending) index
  if (tid < num_threads_active)
  {

    unsigned int addr = c_block_offset_output + tid;

    g_lambda[addr] = s_left[tid];
    g_pos[addr] = s_right_count[tid];
  }
}
} // namespace cuda
} // namespace linalg
} // namespace viennacl

#endif // #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_
