http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp deleted file mode 100644 index 65b323e..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp +++ /dev/null @@ -1,821 +0,0 @@ -#ifndef VIENNACL_LINALG_CUDA_AMG_OPERATIONS_HPP -#define VIENNACL_LINALG_CUDA_AMG_OPERATIONS_HPP - -/* ========================================================================= - Copyright (c) 2010-2016, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. - Portions of this software are copyright by UChicago Argonne, LLC. - - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the PDF manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - -/** @file cuda/amg_operations.hpp - @brief Implementations of routines for AMG in CUDA. 
-*/ - -#include <cstdlib> -#include <cmath> -#include "viennacl/linalg/detail/amg/amg_base.hpp" - -#include <map> -#include <set> - -namespace viennacl -{ -namespace linalg -{ -namespace cuda -{ -namespace amg -{ - - -/////////////////////////////////////////// - -__global__ void amg_influence_trivial_kernel( - const unsigned int * row_indices, - const unsigned int * column_indices, - unsigned int size1, - unsigned int nnz, - unsigned int *influences_row, - unsigned int *influences_id, - unsigned int *influences_values - ) -{ - unsigned int global_id = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int global_size = gridDim.x * blockDim.x; - - for (unsigned int i = global_id; i < size1; i += global_size) - { - unsigned int tmp = row_indices[i]; - influences_row[i] = tmp; - influences_values[i] = row_indices[i+1] - tmp; - } - - for (unsigned int i = global_id; i < nnz; i += global_size) - influences_id[i] = column_indices[i]; - - if (global_id == 0) - influences_row[size1] = row_indices[size1]; -} - - -/** @brief Routine for taking all connections in the matrix as strong */ -template<typename NumericT> -void amg_influence_trivial(compressed_matrix<NumericT> const & A, - viennacl::linalg::detail::amg::amg_level_context & amg_context, - viennacl::linalg::amg_tag & tag) -{ - (void)tag; - - amg_influence_trivial_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()), - viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()), - static_cast<unsigned int>(A.size1()), - static_cast<unsigned int>(A.nnz()), - viennacl::cuda_arg(amg_context.influence_jumper_), - viennacl::cuda_arg(amg_context.influence_ids_), - viennacl::cuda_arg(amg_context.influence_values_) - ); - VIENNACL_CUDA_LAST_ERROR_CHECK("amg_influence_trivial_kernel"); -} - - -/** @brief Routine for extracting strongly connected points considering a user-provided threshold value */ -template<typename NumericT> -void amg_influence_advanced(compressed_matrix<NumericT> const & A, - 
viennacl::linalg::detail::amg::amg_level_context & amg_context, - viennacl::linalg::amg_tag & tag) -{ - throw std::runtime_error("not implemented yet"); -} - -/** @brief Dispatcher for influence processing */ -template<typename NumericT> -void amg_influence(compressed_matrix<NumericT> const & A, - viennacl::linalg::detail::amg::amg_level_context & amg_context, - viennacl::linalg::amg_tag & tag) -{ - // TODO: dispatch based on influence tolerance provided - amg_influence_trivial(A, amg_context, tag); -} - -/** @brief Assign IDs to coarse points. -* -* TODO: Use exclusive_scan on GPU for this. -*/ -inline void enumerate_coarse_points(viennacl::linalg::detail::amg::amg_level_context & amg_context) -{ - viennacl::backend::typesafe_host_array<unsigned int> point_types(amg_context.point_types_.handle(), amg_context.point_types_.size()); - viennacl::backend::typesafe_host_array<unsigned int> coarse_ids(amg_context.coarse_id_.handle(), amg_context.coarse_id_.size()); - viennacl::backend::memory_read(amg_context.point_types_.handle(), 0, point_types.raw_size(), point_types.get()); - viennacl::backend::memory_read(amg_context.coarse_id_.handle(), 0, coarse_ids.raw_size(), coarse_ids.get()); - - unsigned int coarse_id = 0; - for (std::size_t i=0; i<amg_context.point_types_.size(); ++i) - { - coarse_ids.set(i, coarse_id); - if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE) - ++coarse_id; - } - - amg_context.num_coarse_ = coarse_id; - - viennacl::backend::memory_write(amg_context.coarse_id_.handle(), 0, coarse_ids.raw_size(), coarse_ids.get()); -} - -////////////////////////////////////// - -/** @brief CUDA kernel initializing the work vectors at each PMIS iteration */ -template<typename IndexT> -__global__ void amg_pmis2_init_workdata(IndexT *work_state, - IndexT *work_random, - IndexT *work_index, - IndexT const *point_types, - IndexT const *random_weights, - unsigned int size) -{ - unsigned int global_id = blockDim.x * blockIdx.x + 
threadIdx.x; - unsigned int global_size = gridDim.x * blockDim.x; - - for (unsigned int i = global_id; i < size; i += global_size) - { - switch (point_types[i]) - { - case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED: work_state[i] = 1; break; - case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE: work_state[i] = 0; break; - case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE: work_state[i] = 2; break; - default: - break; // do nothing - } - - work_random[i] = random_weights[i]; - work_index[i] = i; - } -} - -/** @brief CUDA kernel propagating the state triple (status, weight, nodeID) to neighbors using a max()-operation */ -template<typename IndexT> -__global__ void amg_pmis2_max_neighborhood(IndexT const *work_state, - IndexT const *work_random, - IndexT const *work_index, - IndexT *work_state2, - IndexT *work_random2, - IndexT *work_index2, - IndexT const *influences_row, - IndexT const *influences_id, - unsigned int size) -{ - unsigned int global_id = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int global_size = gridDim.x * blockDim.x; - - for (unsigned int i = global_id; i < size; i += global_size) - { - // load - unsigned int state = work_state[i]; - unsigned int random = work_random[i]; - unsigned int index = work_index[i]; - - // max - unsigned int j_stop = influences_row[i + 1]; - for (unsigned int j = influences_row[i]; j < j_stop; ++j) - { - unsigned int influenced_point_id = influences_id[j]; - - // lexigraphical triple-max (not particularly pretty, but does the job): - if (state < work_state[influenced_point_id]) - { - state = work_state[influenced_point_id]; - random = work_random[influenced_point_id]; - index = work_index[influenced_point_id]; - } - else if (state == work_state[influenced_point_id]) - { - if (random < work_random[influenced_point_id]) - { - state = work_state[influenced_point_id]; - random = work_random[influenced_point_id]; - index = 
work_index[influenced_point_id]; - } - else if (random == work_random[influenced_point_id]) - { - if (index < work_index[influenced_point_id]) - { - state = work_state[influenced_point_id]; - random = work_random[influenced_point_id]; - index = work_index[influenced_point_id]; - } - } // max(random) - } // max(state) - } // for - - // store - work_state2[i] = state; - work_random2[i] = random; - work_index2[i] = index; - } -} - -/** @brief CUDA kernel for marking MIS and non-MIS nodes */ -template<typename IndexT> -__global__ void amg_pmis2_mark_mis_nodes(IndexT const *work_state, - IndexT const *work_index, - IndexT *point_types, - IndexT *undecided_buffer, - unsigned int size) -{ - unsigned int global_id = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int global_size = gridDim.x * blockDim.x; - - unsigned int num_undecided = 0; - for (unsigned int i = global_id; i < size; i += global_size) - { - unsigned int max_state = work_state[i]; - unsigned int max_index = work_index[i]; - - if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) - { - if (i == max_index) // make this a MIS node - point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE; - else if (max_state == 2) // mind the mapping of viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE above! - point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE; - else - num_undecided += 1; - } - } - - // reduction of the number of undecided nodes: - __shared__ unsigned int shared_buffer[256]; - shared_buffer[threadIdx.x] = num_undecided; - for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) - { - __syncthreads(); - if (threadIdx.x < stride) - shared_buffer[threadIdx.x] += shared_buffer[threadIdx.x+stride]; - } - - if (threadIdx.x == 0) - undecided_buffer[blockIdx.x] = shared_buffer[0]; - -} - -/** @brief CUDA kernel for resetting non-MIS (i.e. 
coarse) points to undecided so that subsequent kernels work */ -__global__ void amg_pmis2_reset_state(unsigned int *point_types, unsigned int size) -{ - unsigned int global_id = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int global_size = gridDim.x * blockDim.x; - - for (unsigned int i = global_id; i < size; i += global_size) - { - if (point_types[i] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE) - point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED; - } -} - -/** @brief AG (aggregation based) coarsening, single-threaded version of stage 1 -* -* @param A Operator matrix on all levels -* @param amg_context AMG hierarchy datastructures -* @param tag AMG preconditioner tag -*/ -template<typename NumericT> -void amg_coarse_ag_stage1_mis2(compressed_matrix<NumericT> const & A, - viennacl::linalg::detail::amg::amg_level_context & amg_context, - viennacl::linalg::amg_tag & tag) -{ - viennacl::vector<unsigned int> random_weights(A.size1(), viennacl::context(viennacl::MAIN_MEMORY)); - unsigned int *random_weights_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(random_weights.handle()); - for (std::size_t i=0; i<random_weights.size(); ++i) - random_weights_ptr[i] = static_cast<unsigned int>(rand()) % static_cast<unsigned int>(A.size1()); - random_weights.switch_memory_context(viennacl::traits::context(A)); - - // work vectors: - viennacl::vector<unsigned int> work_state(A.size1(), viennacl::traits::context(A)); - viennacl::vector<unsigned int> work_random(A.size1(), viennacl::traits::context(A)); - viennacl::vector<unsigned int> work_index(A.size1(), viennacl::traits::context(A)); - - viennacl::vector<unsigned int> work_state2(A.size1(), viennacl::traits::context(A)); - viennacl::vector<unsigned int> work_random2(A.size1(), viennacl::traits::context(A)); - viennacl::vector<unsigned int> work_index2(A.size1(), viennacl::traits::context(A)); - - unsigned int num_undecided = 
static_cast<unsigned int>(A.size1()); - viennacl::vector<unsigned int> undecided_buffer(256, viennacl::traits::context(A)); - viennacl::backend::typesafe_host_array<unsigned int> undecided_buffer_host(undecided_buffer.handle(), undecided_buffer.size()); - - unsigned int pmis_iters = 0; - while (num_undecided > 0) - { - ++pmis_iters; - - // - // init temporary work data: - // - amg_pmis2_init_workdata<<<128, 128>>>(viennacl::cuda_arg(work_state), - viennacl::cuda_arg(work_random), - viennacl::cuda_arg(work_index), - viennacl::cuda_arg(amg_context.point_types_), - viennacl::cuda_arg(random_weights), - static_cast<unsigned int>(A.size1()) - ); - VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_reset_state"); - - - // - // Propagate maximum tuple twice - // - for (unsigned int r = 0; r < 2; ++r) - { - // max operation over neighborhood - amg_pmis2_max_neighborhood<<<128, 128>>>(viennacl::cuda_arg(work_state), - viennacl::cuda_arg(work_random), - viennacl::cuda_arg(work_index), - viennacl::cuda_arg(work_state2), - viennacl::cuda_arg(work_random2), - viennacl::cuda_arg(work_index2), - viennacl::cuda_arg(amg_context.influence_jumper_), - viennacl::cuda_arg(amg_context.influence_ids_), - static_cast<unsigned int>(A.size1()) - ); - VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_max_neighborhood"); - - // copy work array (can be fused into a single kernel if needed. 
Previous kernel is in most cases sufficiently heavy) - work_state = work_state2; - work_random = work_random2; - work_index = work_index2; - } - - // - // mark MIS and non-MIS nodes: - // - amg_pmis2_mark_mis_nodes<<<128, 128>>>(viennacl::cuda_arg(work_state), - viennacl::cuda_arg(work_index), - viennacl::cuda_arg(amg_context.point_types_), - viennacl::cuda_arg(undecided_buffer), - static_cast<unsigned int>(A.size1()) - ); - VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_reset_state"); - - // get number of undecided points on host: - viennacl::backend::memory_read(undecided_buffer.handle(), 0, undecided_buffer_host.raw_size(), undecided_buffer_host.get()); - num_undecided = 0; - for (std::size_t i=0; i<undecided_buffer.size(); ++i) - num_undecided += undecided_buffer_host[i]; - - } //while - - // consistency with sequential MIS: reset state for non-coarse points, so that coarse indices are correctly picked up later - amg_pmis2_reset_state<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_), - static_cast<unsigned int>(amg_context.point_types_.size()) - ); - VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_reset_state"); -} - - - - - -template<typename IndexT> -__global__ void amg_agg_propagate_coarse_indices(IndexT *point_types, - IndexT *coarse_ids, - IndexT const *influences_row, - IndexT const *influences_id, - unsigned int size) -{ - unsigned int global_id = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int global_size = gridDim.x * blockDim.x; - - for (unsigned int i = global_id; i < size; i += global_size) - { - if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE) - { - unsigned int coarse_index = coarse_ids[i]; - - unsigned int j_stop = influences_row[i + 1]; - for (unsigned int j = influences_row[i]; j < j_stop; ++j) - { - unsigned int influenced_point_id = influences_id[j]; - coarse_ids[influenced_point_id] = coarse_index; // Set aggregate index for fine point - - if (influenced_point_id != i) // Note: Any write races 
between threads are harmless here - point_types[influenced_point_id] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE; - } - } - } -} - - -template<typename IndexT> -__global__ void amg_agg_merge_undecided(IndexT *point_types, - IndexT *coarse_ids, - IndexT const *influences_row, - IndexT const *influences_id, - unsigned int size) -{ - unsigned int global_id = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int global_size = gridDim.x * blockDim.x; - - for (unsigned int i = global_id; i < size; i += global_size) - { - if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) - { - unsigned int j_stop = influences_row[i + 1]; - for (unsigned int j = influences_row[i]; j < j_stop; ++j) - { - unsigned int influenced_point_id = influences_id[j]; - if (point_types[influenced_point_id] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) // either coarse or fine point - { - //std::cout << "Setting fine node " << i << " to be aggregated with node " << *influence_iter << "/" << pointvector.get_coarse_index(*influence_iter) << std::endl; - coarse_ids[i] = coarse_ids[influenced_point_id]; - break; - } - } - } - } -} - - -__global__ void amg_agg_merge_undecided_2(unsigned int *point_types, - unsigned int size) -{ - unsigned int global_id = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int global_size = gridDim.x * blockDim.x; - - for (unsigned int i = global_id; i < size; i += global_size) - { - if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) - point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE; - } -} - - -/** @brief AG (aggregation based) coarsening. 
Partially single-threaded version (VIENNACL_AMG_COARSE_AG) -* -* @param A Operator matrix -* @param amg_context AMG hierarchy datastructures -* @param tag AMG preconditioner tag -*/ -template<typename NumericT> -void amg_coarse_ag(compressed_matrix<NumericT> const & A, - viennacl::linalg::detail::amg::amg_level_context & amg_context, - viennacl::linalg::amg_tag & tag) -{ - - amg_influence_trivial(A, amg_context, tag); - - // - // Stage 1: Build aggregates: - // - if (tag.get_coarsening_method() == viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION) - amg_coarse_ag_stage1_mis2(A, amg_context, tag); - else - throw std::runtime_error("Only MIS2 coarsening implemented. Selected coarsening not available with CUDA backend!"); - - viennacl::linalg::cuda::amg::enumerate_coarse_points(amg_context); - - // - // Stage 2: Propagate coarse aggregate indices to neighbors: - // - amg_agg_propagate_coarse_indices<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_), - viennacl::cuda_arg(amg_context.coarse_id_), - viennacl::cuda_arg(amg_context.influence_jumper_), - viennacl::cuda_arg(amg_context.influence_ids_), - static_cast<unsigned int>(A.size1()) - ); - VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_propagate_coarse_indices"); - - - // - // Stage 3: Merge remaining undecided points (merging to first aggregate found when cycling over the hierarchy - // - amg_agg_merge_undecided<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_), - viennacl::cuda_arg(amg_context.coarse_id_), - viennacl::cuda_arg(amg_context.influence_jumper_), - viennacl::cuda_arg(amg_context.influence_ids_), - static_cast<unsigned int>(A.size1()) - ); - VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_merge_undecided"); - - // - // Stage 4: Set undecided points to fine points (coarse ID already set in Stage 3) - // Note: Stage 3 and Stage 4 were initially fused, but are now split in order to avoid race conditions (or a fallback to sequential execution). 
- // - amg_agg_merge_undecided_2<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_), - static_cast<unsigned int>(A.size1()) - ); - VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_merge_undecided_2"); -} - - - - -/** @brief Calls the right coarsening procedure -* -* @param A Operator matrix on all levels -* @param amg_context AMG hierarchy datastructures -* @param tag AMG preconditioner tag -*/ -template<typename InternalT1> -void amg_coarse(InternalT1 & A, - viennacl::linalg::detail::amg::amg_level_context & amg_context, - viennacl::linalg::amg_tag & tag) -{ - switch (tag.get_coarsening_method()) - { - case viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION: amg_coarse_ag(A, amg_context, tag); break; - default: throw std::runtime_error("not implemented yet"); - } -} - - - - -////////////////////////////////////// Interpolation ///////////////////////////// - -template<typename NumericT> -__global__ void amg_interpol_ag_kernel(unsigned int *P_row_buffer, - unsigned int *P_col_buffer, - NumericT *P_elements, - unsigned int *coarse_ids, - unsigned int size) -{ - unsigned int global_id = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int global_size = gridDim.x * blockDim.x; - - for (unsigned int i = global_id; i < size; i += global_size) - { - P_row_buffer[i] = i; - P_col_buffer[i] = coarse_ids[i]; - P_elements[i] = NumericT(1); - } - - // set last entry as well: - if (global_id == 0) - P_row_buffer[size] = size; -} - -/** @brief AG (aggregation based) interpolation. Multi-Threaded! 
(VIENNACL_INTERPOL_SA) - * - * @param A Operator matrix - * @param P Prolongation matrix - * @param amg_context AMG hierarchy datastructures - * @param tag AMG configuration tag -*/ -template<typename NumericT> -void amg_interpol_ag(compressed_matrix<NumericT> const & A, - compressed_matrix<NumericT> & P, - viennacl::linalg::detail::amg::amg_level_context & amg_context, - viennacl::linalg::amg_tag & tag) -{ - (void)tag; - P = compressed_matrix<NumericT>(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A)); - - amg_interpol_ag_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(P.handle1().cuda_handle()), - viennacl::cuda_arg<unsigned int>(P.handle2().cuda_handle()), - viennacl::cuda_arg<NumericT>(P.handle().cuda_handle()), - viennacl::cuda_arg(amg_context.coarse_id_), - static_cast<unsigned int>(A.size1()) - ); - VIENNACL_CUDA_LAST_ERROR_CHECK("amg_interpol_ag_kernel"); - - P.generate_row_block_information(); -} - - - -template<typename NumericT> -__global__ void amg_interpol_sa_kernel( - const unsigned int *A_row_indices, - const unsigned int *A_col_indices, - const NumericT *A_elements, - unsigned int A_size1, - unsigned int A_nnz, - unsigned int *Jacobi_row_indices, - unsigned int *Jacobi_col_indices, - NumericT *Jacobi_elements, - NumericT omega - ) -{ - unsigned int global_id = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int global_size = gridDim.x * blockDim.x; - - for (unsigned int row = global_id; row < A_size1; row += global_size) - { - unsigned int row_begin = A_row_indices[row]; - unsigned int row_end = A_row_indices[row+1]; - - Jacobi_row_indices[row] = row_begin; - - // Step 1: Extract diagonal: - NumericT diag = 0; - for (unsigned int j = row_begin; j < row_end; ++j) - { - if (A_col_indices[j] == row) - { - diag = A_elements[j]; - break; - } - } - - // Step 2: Write entries: - for (unsigned int j = row_begin; j < row_end; ++j) - { - unsigned int col_index = A_col_indices[j]; - Jacobi_col_indices[j] = col_index; - - if 
(col_index == row) - Jacobi_elements[j] = NumericT(1) - omega; - else - Jacobi_elements[j] = - omega * A_elements[j] / diag; - } - } - - if (global_id == 0) - Jacobi_row_indices[A_size1] = A_nnz; // don't forget finalizer -} - - - -/** @brief Smoothed aggregation interpolation. (VIENNACL_INTERPOL_SA) - * - * @param A Operator matrix - * @param P Prolongation matrix - * @param amg_context AMG hierarchy datastructures - * @param tag AMG configuration tag -*/ -template<typename NumericT> -void amg_interpol_sa(compressed_matrix<NumericT> const & A, - compressed_matrix<NumericT> & P, - viennacl::linalg::detail::amg::amg_level_context & amg_context, - viennacl::linalg::amg_tag & tag) -{ - (void)tag; - viennacl::compressed_matrix<NumericT> P_tentative(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A)); - - // form tentative operator: - amg_interpol_ag(A, P_tentative, amg_context, tag); - - viennacl::compressed_matrix<NumericT> Jacobi(A.size1(), A.size1(), A.nnz(), viennacl::traits::context(A)); - - amg_interpol_sa_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()), - viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()), - viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()), - static_cast<unsigned int>(A.size1()), - static_cast<unsigned int>(A.nnz()), - viennacl::cuda_arg<unsigned int>(Jacobi.handle1().cuda_handle()), - viennacl::cuda_arg<unsigned int>(Jacobi.handle2().cuda_handle()), - viennacl::cuda_arg<NumericT>(Jacobi.handle().cuda_handle()), - NumericT(tag.get_jacobi_weight()) - ); - VIENNACL_CUDA_LAST_ERROR_CHECK("amg_interpol_sa_kernel"); - - P = viennacl::linalg::prod(Jacobi, P_tentative); - - P.generate_row_block_information(); -} - - -/** @brief Dispatcher for building the interpolation matrix - * - * @param A Operator matrix - * @param P Prolongation matrix - * @param amg_context AMG hierarchy datastructures - * @param tag AMG configuration tag -*/ -template<typename MatrixT> -void 
amg_interpol(MatrixT const & A, - MatrixT & P, - viennacl::linalg::detail::amg::amg_level_context & amg_context, - viennacl::linalg::amg_tag & tag) -{ - switch (tag.get_interpolation_method()) - { - case viennacl::linalg::AMG_INTERPOLATION_METHOD_AGGREGATION: amg_interpol_ag (A, P, amg_context, tag); break; - case viennacl::linalg::AMG_INTERPOLATION_METHOD_SMOOTHED_AGGREGATION: amg_interpol_sa (A, P, amg_context, tag); break; - default: throw std::runtime_error("Not implemented yet!"); - } -} - - -template<typename NumericT> -__global__ void compressed_matrix_assign_to_dense( - const unsigned int * row_indices, - const unsigned int * column_indices, - const NumericT * elements, - NumericT *B, - unsigned int B_row_start, unsigned int B_col_start, - unsigned int B_row_inc, unsigned int B_col_inc, - unsigned int B_row_size, unsigned int B_col_size, - unsigned int B_internal_rows, unsigned int B_internal_cols) -{ - for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; - row < B_row_size; - row += gridDim.x * blockDim.x) - { - unsigned int row_end = row_indices[row+1]; - for (unsigned int j = row_indices[row]; j<row_end; j++) - B[(B_row_start + row * B_row_inc) * B_internal_cols + B_col_start + column_indices[j] * B_col_inc] = elements[j]; - } -} - - -template<typename NumericT, unsigned int AlignmentV> -void assign_to_dense(viennacl::compressed_matrix<NumericT, AlignmentV> const & A, - viennacl::matrix_base<NumericT> & B) -{ - compressed_matrix_assign_to_dense<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()), - viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()), - viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()), - viennacl::cuda_arg<NumericT>(B), - static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), - static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), - static_cast<unsigned 
int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), - static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)) - ); - VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_assign_to_dense"); -} - - - - -template<typename NumericT> -__global__ void compressed_matrix_smooth_jacobi_kernel( - const unsigned int * row_indices, - const unsigned int * column_indices, - const NumericT * elements, - NumericT weight, - const NumericT * x_old, - NumericT * x_new, - const NumericT * rhs, - unsigned int size) -{ - for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; - row < size; - row += gridDim.x * blockDim.x) - { - NumericT sum = NumericT(0); - NumericT diag = NumericT(1); - unsigned int row_end = row_indices[row+1]; - for (unsigned int j = row_indices[row]; j < row_end; ++j) - { - unsigned int col = column_indices[j]; - if (col == row) - diag = elements[j]; - else - sum += elements[j] * x_old[col]; - } - x_new[row] = weight * (rhs[row] - sum) / diag + (NumericT(1) - weight) * x_old[row]; - } -} - - - - -/** @brief Damped Jacobi Smoother (CUDA version) -* -* @param iterations Number of smoother iterations -* @param A Operator matrix for the smoothing -* @param x The vector smoothing is applied to -* @param x_backup (Different) Vector holding the same values as x -* @param rhs_smooth The right hand side of the equation for the smoother -* @param weight Damping factor. 0: No effect of smoother. 
1: Undamped Jacobi iteration -*/ -template<typename NumericT> -void smooth_jacobi(unsigned int iterations, - compressed_matrix<NumericT> const & A, - vector<NumericT> & x, - vector<NumericT> & x_backup, - vector<NumericT> const & rhs_smooth, - NumericT weight) -{ - for (unsigned int i=0; i<iterations; ++i) - { - x_backup = x; - - compressed_matrix_smooth_jacobi_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()), - viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()), - viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()), - static_cast<NumericT>(weight), - viennacl::cuda_arg(x_backup), - viennacl::cuda_arg(x), - viennacl::cuda_arg(rhs_smooth), - static_cast<unsigned int>(rhs_smooth.size()) - ); - VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_smooth_jacobi_kernel"); - } -} - - -} //namespace amg -} //namespace host_based -} //namespace linalg -} //namespace viennacl - -#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp deleted file mode 100644 index 39f0015..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp +++ /dev/null @@ -1,166 +0,0 @@ -#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_CALLS_HPP_ -#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_CALLS_HPP_ - -/* ========================================================================= - Copyright (c) 2010-2016, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. - Portions of this software are copyright by UChicago Argonne, LLC. - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - - -/** @file viennacl/linalg/cuda/bisect_kernel_calls.hpp - @brief CUDA kernel calls for the bisection algorithm - - Implementation based on the sample provided with the CUDA 6.0 SDK, for which - the creation of derivative works is allowed by including the following statement: - "This software contains source code provided by NVIDIA Corporation." 
-*/
-
-#include "viennacl/linalg/detail/bisect/structs.hpp"
-
-// includes, kernels
-#include "viennacl/linalg/cuda/bisect_kernel_small.hpp"
-#include "viennacl/linalg/cuda/bisect_kernel_large.hpp"
-#include "viennacl/linalg/cuda/bisect_kernel_large_onei.hpp"
-#include "viennacl/linalg/cuda/bisect_kernel_large_multi.hpp"
-
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace cuda
-{
/* Host-side launcher for small matrices: a single thread block of
   VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX threads bisects the input
   interval [lg, ug] of the tridiagonal matrix (diagonal input.g_a,
   superdiagonal input.g_b) down to `precision`; interval limits and
   eigenvalue counts are written into result.vcl_g_*.  The superdiagonal
   pointer is offset by +1 so the kernel may read the element before the
   first superdiagonal entry (see kernel docs below). */
-template<typename NumericT>
-void bisectSmall(const viennacl::linalg::detail::InputData<NumericT> &input, viennacl::linalg::detail::ResultDataSmall<NumericT> &result,
-                 const unsigned int mat_size,
-                 const NumericT lg, const NumericT ug,
-                 const NumericT precision)
-{
-
-
-  dim3  blocks(1, 1, 1);
-  dim3  threads(VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX, 1, 1);
-
-  bisectKernelSmall<<< blocks, threads >>>(
-    viennacl::cuda_arg(input.g_a),
-    viennacl::cuda_arg(input.g_b) + 1,
-    mat_size,
-    viennacl::cuda_arg(result.vcl_g_left),
-    viennacl::cuda_arg(result.vcl_g_right),
-    viennacl::cuda_arg(result.vcl_g_left_count),
-    viennacl::cuda_arg(result.vcl_g_right_count),
-    lg, ug, 0, mat_size,
-    precision
-    );
-  viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("Kernel launch failed");
-}
-
-
/* Host-side launcher, first step for large matrices: one thread block
   (VIENNACL_BISECT_MAX_THREADS_BLOCK threads, halved for mat_size <= 512)
   subdivides [lg, ug] into coarse intervals and splits them into a list of
   intervals containing exactly one eigenvalue (result.g_*_one) and blocks
   of intervals containing several eigenvalues (result.g_*_mult,
   result.g_blocks_mult*).  Processed further by the two launchers below. */
-template<typename NumericT>
-void bisectLarge(const viennacl::linalg::detail::InputData<NumericT> &input, viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
-                 const unsigned int mat_size,
-                 const NumericT lg, const NumericT ug,
-                 const NumericT precision)
- {
-
-  dim3  blocks(1, 1, 1);
-  dim3  threads(mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2 , 1, 1);
-  bisectKernelLarge<<< blocks, threads >>>
-    (viennacl::cuda_arg(input.g_a),
-     viennacl::cuda_arg(input.g_b) + 1,
-     mat_size,
-     lg, ug, static_cast<unsigned int>(0), mat_size, precision,
-     viennacl::cuda_arg(result.g_num_one),
-     viennacl::cuda_arg(result.g_num_blocks_mult),
-     viennacl::cuda_arg(result.g_left_one),
-     viennacl::cuda_arg(result.g_right_one),
-     viennacl::cuda_arg(result.g_pos_one),
-     viennacl::cuda_arg(result.g_left_mult),
-     viennacl::cuda_arg(result.g_right_mult),
-     viennacl::cuda_arg(result.g_left_count_mult),
-     viennacl::cuda_arg(result.g_right_count_mult),
-     viennacl::cuda_arg(result.g_blocks_mult),
-     viennacl::cuda_arg(result.g_blocks_mult_sum)
-    );
-  viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("Kernel launch failed.");
-}
-
-
-// compute eigenvalues for intervals that contained only one eigenvalue
-// after the first processing step
/* NOTE(review): result.g_num_one is read here directly into a host
   unsigned int, while bisectKernelLarge above writes it through
   viennacl::cuda_arg(result.g_num_one) — presumably ResultDataLarge mirrors
   this counter back to the host between the two launches; confirm against
   detail/bisect/structs.hpp. */
-template<typename NumericT>
-void bisectLarge_OneIntervals(const viennacl::linalg::detail::InputData<NumericT> &input, viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
-                              const unsigned int mat_size,
-                              const NumericT precision)
- {
-
-  unsigned int num_one_intervals = result.g_num_one;
-  unsigned int num_blocks = viennacl::linalg::detail::getNumBlocksLinear(num_one_intervals,
-                                                                         mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2);
-  dim3 grid_onei;
-  grid_onei.x = num_blocks;
-  grid_onei.y = 1, grid_onei.z = 1;
-  dim3 threads_onei(mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2, 1, 1);
-
-
-  bisectKernelLarge_OneIntervals<<< grid_onei , threads_onei >>>
-    (viennacl::cuda_arg(input.g_a),
-     viennacl::cuda_arg(input.g_b) + 1,
-     mat_size, num_one_intervals,
-     viennacl::cuda_arg(result.g_left_one),
-     viennacl::cuda_arg(result.g_right_one),
-     viennacl::cuda_arg(result.g_pos_one),
-     precision
-    );
-  viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("bisectKernelLarge_OneIntervals() FAILED.");
-}
-
-
-// process intervals that contained more than one eigenvalue after
-// the first processing step
-template<typename NumericT>
-void bisectLarge_MultIntervals(const viennacl::linalg::detail::InputData<NumericT> &input, viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
-                               const unsigned int mat_size,
-                               const NumericT precision)
- {
-  // get the number of blocks of intervals that contain, in total when
-  // each interval contains only one eigenvalue, not more than
-  // MAX_THREADS_BLOCK threads
/* NOTE(review): same host-side read of a device-written counter as in
   bisectLarge_OneIntervals above — verify the host mirror is updated. */
-  unsigned int num_blocks_mult = result.g_num_blocks_mult;
-
-  // setup the execution environment
-  dim3 grid_mult(num_blocks_mult, 1, 1);
-  dim3 threads_mult(mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2, 1, 1);
-
-  bisectKernelLarge_MultIntervals<<< grid_mult, threads_mult >>>
-    (viennacl::cuda_arg(input.g_a),
-     viennacl::cuda_arg(input.g_b) + 1,
-     mat_size,
-     viennacl::cuda_arg(result.g_blocks_mult),
-     viennacl::cuda_arg(result.g_blocks_mult_sum),
-     viennacl::cuda_arg(result.g_left_mult),
-     viennacl::cuda_arg(result.g_right_mult),
-     viennacl::cuda_arg(result.g_left_count_mult),
-     viennacl::cuda_arg(result.g_right_count_mult),
-     viennacl::cuda_arg(result.g_lambda_mult),
-     viennacl::cuda_arg(result.g_pos_mult),
-     precision
-    );
-  viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("bisectKernelLarge_MultIntervals() FAILED.");
-}
-}
-}
-}
-
-#endif http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp deleted file mode 100755 index 77c9773..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp +++ /dev/null @@ -1,928 +0,0 @@ -#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_ -#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_ - -/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
- - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - - -/** @file viennacl/linalg/cuda/bisect_kernel_large.hpp - @brief First step of the bisection algorithm for the computation of eigenvalues. - - Implementation based on the sample provided with the CUDA 6.0 SDK, for which - the creation of derivative works is allowed by including the following statement: - "This software contains source code provided by NVIDIA Corporation." -*/ - - -/* Determine eigenvalues for large symmetric, tridiagonal matrix. First - step of the computation. */ - -// includes, project -#include "viennacl/linalg/detail/bisect/config.hpp" -#include "viennacl/linalg/detail/bisect/util.hpp" -// additional kernel -#include "viennacl/linalg/cuda/bisect_util.hpp" - -// declaration, forward - -namespace viennacl -{ -namespace linalg -{ -namespace cuda -{ -//////////////////////////////////////////////////////////////////////////////// -//! 
Write data to global memory
-////////////////////////////////////////////////////////////////////////////////
/* Copies the compacted interval lists from shared memory to global memory:
   entries below offset_mult_lambda go to the single-eigenvalue arrays
   (g_*_one), the rest to the multi-eigenvalue arrays (g_*_mult).  Each
   thread handles up to two entries (tid and tid_2 = tid + blockDim.x). */
-template<typename NumericT>
-__device__
-void writeToGmem(const unsigned int tid, const unsigned int tid_2,
-                 const unsigned int num_threads_active,
-                 const unsigned int num_blocks_mult,
-                 NumericT *g_left_one, NumericT *g_right_one,
-                 unsigned int *g_pos_one,
-                 NumericT *g_left_mult, NumericT *g_right_mult,
-                 unsigned int *g_left_count_mult,
-                 unsigned int *g_right_count_mult,
-                 NumericT *s_left, NumericT *s_right,
-                 unsigned short *s_left_count, unsigned short *s_right_count,
-                 unsigned int *g_blocks_mult,
-                 unsigned int *g_blocks_mult_sum,
-                 unsigned short *s_compaction_list,
-                 unsigned short *s_cl_helper,
-                 unsigned int offset_mult_lambda
-                )
-{
-
-  if (tid < offset_mult_lambda)
-  {
-
-    g_left_one[tid] = s_left[tid];
-    g_right_one[tid] = s_right[tid];
-    // right count can be used to order eigenvalues without sorting
-    g_pos_one[tid] = s_right_count[tid];
-  }
-  else
-  {
-
-
-    g_left_mult[tid - offset_mult_lambda] = s_left[tid];
-    g_right_mult[tid - offset_mult_lambda] = s_right[tid];
-    g_left_count_mult[tid - offset_mult_lambda] = s_left_count[tid];
-    g_right_count_mult[tid - offset_mult_lambda] = s_right_count[tid];
-  }
-
-  if (tid_2 < num_threads_active)
-  {
-
-    if (tid_2 < offset_mult_lambda)
-    {
-
-      g_left_one[tid_2] = s_left[tid_2];
-      g_right_one[tid_2] = s_right[tid_2];
-      // right count can be used to order eigenvalues without sorting
-      g_pos_one[tid_2] = s_right_count[tid_2];
-    }
-    else
-    {
-
-      g_left_mult[tid_2 - offset_mult_lambda] = s_left[tid_2];
-      g_right_mult[tid_2 - offset_mult_lambda] = s_right[tid_2];
-      g_left_count_mult[tid_2 - offset_mult_lambda] = s_left_count[tid_2];
-      g_right_count_mult[tid_2 - offset_mult_lambda] = s_right_count[tid_2];
-    }
-
-  } // end writing out data
-
-  __syncthreads();
-
-  // note that s_cl_blocking = s_compaction_list + 1;, that is by writing out
-  // s_compaction_list we write the exclusive scan result
-  if (tid <= num_blocks_mult)
-  {
-    g_blocks_mult[tid] = s_compaction_list[tid];
-    g_blocks_mult_sum[tid] = s_cl_helper[tid];
-  }
-
-  if (tid_2 <= num_blocks_mult)
-  {
-    g_blocks_mult[tid_2] = s_compaction_list[tid_2];
-    g_blocks_mult_sum[tid_2] = s_cl_helper[tid_2];
-  }
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Perform final stream compaction before writing data to global memory
-////////////////////////////////////////////////////////////////////////////////
/* Reorders each thread's cached interval(s) into two compact shared-memory
   streams (single-eigenvalue intervals first, multi-eigenvalue intervals
   after offset_mult_lambda) using the scan results s_cl_one / s_cl_mult,
   and records the end index and eigenvalue sum of each interval block in
   s_cl_blocking / s_cl_helper. */
-template<typename NumericT>
-__device__
-void
-compactStreamsFinal(const unsigned int tid, const unsigned int tid_2,
-                    const unsigned int num_threads_active,
-                    unsigned int &offset_mult_lambda,
-                    NumericT *s_left, NumericT *s_right,
-                    unsigned short *s_left_count, unsigned short *s_right_count,
-                    unsigned short *s_cl_one, unsigned short *s_cl_mult,
-                    unsigned short *s_cl_blocking, unsigned short *s_cl_helper,
-                    unsigned int is_one_lambda, unsigned int is_one_lambda_2,
-                    NumericT &left, NumericT &right, NumericT &left_2, NumericT &right_2,
-                    unsigned int &left_count, unsigned int &right_count,
-                    unsigned int &left_count_2, unsigned int &right_count_2,
-                    unsigned int c_block_iend, unsigned int c_sum_block,
-                    unsigned int c_block_iend_2, unsigned int c_sum_block_2
-                   )
-{
-  // cache data before performing compaction
-  left = s_left[tid];
-  right = s_right[tid];
-
-  if (tid_2 < num_threads_active)
-  {
-
-    left_2 = s_left[tid_2];
-    right_2 = s_right[tid_2];
-  }
-
-  __syncthreads();
-
-  // determine addresses for intervals containing multiple eigenvalues and
-  // addresses for blocks of intervals
-  unsigned int ptr_w = 0;
-  unsigned int ptr_w_2 = 0;
-  unsigned int ptr_blocking_w = 0;
-  unsigned int ptr_blocking_w_2 = 0;
-
-
-
-  ptr_w = (1 == is_one_lambda) ? s_cl_one[tid]
-          : s_cl_mult[tid] + offset_mult_lambda;
-
-  if (0 != c_block_iend)
-  {
-    ptr_blocking_w = s_cl_blocking[tid];
-  }
-
-  if (tid_2 < num_threads_active)
-  {
-    ptr_w_2 = (1 == is_one_lambda_2) ? s_cl_one[tid_2]
-              : s_cl_mult[tid_2] + offset_mult_lambda;
-
-    if (0 != c_block_iend_2)
-    {
-      ptr_blocking_w_2 = s_cl_blocking[tid_2];
-    }
-  }
-
-
-  __syncthreads();
-  if(tid < num_threads_active)
-  {
-    // store compactly in shared mem
-    s_left[ptr_w] = left;
-    s_right[ptr_w] = right;
-    s_left_count[ptr_w] = left_count;
-    s_right_count[ptr_w] = right_count;
-  }
-
-
-  __syncthreads();
/* NOTE(review): this tid == 1 re-store repeats the store already done for
   all active threads above and writes through thread 1's ptr_w only; it
   looks redundant (possibly a leftover workaround) — kept as-is. */
-  if(tid == 1)
-  {
-    s_left[ptr_w] = left;
-    s_right[ptr_w] = right;
-    s_left_count[ptr_w] = left_count;
-    s_right_count[ptr_w] = right_count;
-  }
-  if (0 != c_block_iend)
-  {
-    s_cl_blocking[ptr_blocking_w + 1] = c_block_iend - 1;
-    s_cl_helper[ptr_blocking_w + 1] = c_sum_block;
-  }
-
-  if (tid_2 < num_threads_active)
-  {
-
-    // store compactly in shared mem
-    s_left[ptr_w_2] = left_2;
-    s_right[ptr_w_2] = right_2;
-    s_left_count[ptr_w_2] = left_count_2;
-    s_right_count[ptr_w_2] = right_count_2;
-
-    if (0 != c_block_iend_2)
-    {
-      s_cl_blocking[ptr_blocking_w_2 + 1] = c_block_iend_2 - 1;
-      s_cl_helper[ptr_blocking_w_2 + 1] = c_sum_block_2;
-    }
-  }
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Compute addresses to obtain compact list of block start addresses
-////////////////////////////////////////////////////////////////////////////////
/* Blelloch-style up-sweep/down-sweep scan over s_cl_helper (copied into
   s_cl_blocking first) to turn per-root block flags into start addresses
   of the compacted block list.  num_threads_compaction must be a power of
   two (it is produced by ceilPow2 in the caller). */
-inline __device__
-void
-scanCompactBlocksStartAddress(const unsigned int tid, const unsigned int tid_2,
-                              const unsigned int num_threads_compaction,
-                              unsigned short *s_cl_blocking,
-                              unsigned short *s_cl_helper
-                             )
-{
-  // prepare for second step of block generation: compaction of the block
-  // list itself to efficiently write out these
-  s_cl_blocking[tid] = s_cl_helper[tid];
-
-  if (tid_2 < num_threads_compaction)
-  {
-    s_cl_blocking[tid_2] = s_cl_helper[tid_2];
-  }
-
-  __syncthreads();
-
-  // additional scan to compact s_cl_blocking that permits to generate a
-  // compact list of eigenvalue blocks each one containing about
-  // VIENNACL_BISECT_MAX_THREADS_BLOCK eigenvalues (so that each of these blocks may be
-  // processed by one thread block in a subsequent processing step)
-
-  unsigned int offset = 1;
-
-  // build scan tree
-  for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
-  {
-
-    __syncthreads();
-
-    if (tid < d)
-    {
-
-      unsigned int ai = offset*(2*tid+1)-1;
-      unsigned int bi = offset*(2*tid+2)-1;
-      s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
-    }
-
-    offset <<= 1;
-  }
-
-  // traverse down tree: first down to level 2 across
-  for (int d = 2; d < num_threads_compaction; d <<= 1)
-  {
-
-    offset >>= 1;
-    __syncthreads();
-
-    //
-    if (tid < (d-1))
-    {
-
-      unsigned int ai = offset*(tid+1) - 1;
-      unsigned int bi = ai + (offset >> 1);
-      s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
-    }
-  }
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Perform scan to obtain number of eigenvalues before a specific block
-////////////////////////////////////////////////////////////////////////////////
/* Scan over s_cl_blocking (same up-sweep/down-sweep shape as above) so that
   each block root holds the number of eigenvalues preceding its block; the
   final fix-up moves the scan result from the padded power-of-two position
   back to the last active entry. */
-inline __device__
-void
-scanSumBlocks(const unsigned int tid, const unsigned int tid_2,
-              const unsigned int num_threads_active,
-              const unsigned int num_threads_compaction,
-              unsigned short *s_cl_blocking,
-              unsigned short *s_cl_helper)
-{
-  unsigned int offset = 1;
-
-  // first step of scan to build the sum of elements within each block
-  // build up tree
-  for (int d = num_threads_compaction >> 1; d > 0; d >>= 1)
-  {
-
-    __syncthreads();
-
-    if (tid < d)
-    {
-
-      unsigned int ai = offset*(2*tid+1)-1;
-      unsigned int bi = offset*(2*tid+2)-1;
-
-      s_cl_blocking[bi] += s_cl_blocking[ai];
-    }
-
-    offset *= 2;
-  }
-
-  // first step of scan to build the sum of elements within each block
-  // traverse down tree
-  for (int d = 2; d < (num_threads_compaction - 1); d <<= 1)
-  {
-
-    offset >>= 1;
-    __syncthreads();
-
-    if (tid < (d-1))
-    {
-
-      unsigned int ai = offset*(tid+1) - 1;
-      unsigned int bi = ai + (offset >> 1);
-
-      s_cl_blocking[bi] += s_cl_blocking[ai];
-    }
-  }
-
-  __syncthreads();
-
-  if (0 == tid)
-  {
-
-    // move last element of scan to last element that is valid
-    // necessary because the number of threads employed for scan is a power
-    // of two and not necessarily the number of active threads
-    s_cl_helper[num_threads_active - 1] =
-      s_cl_helper[num_threads_compaction - 1];
-    s_cl_blocking[num_threads_active - 1] =
-      s_cl_blocking[num_threads_compaction - 1];
-  }
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Perform initial scan for compaction of intervals containing one and
-//! 
multiple eigenvalues
-////////////////////////////////////////////////////////////////////////////////
/* Combined scans over s_cl_one / s_cl_mult (stream compaction addresses)
   while simultaneously building, in s_cl_blocking / s_cl_helper, a tree of
   interval blocks: subtrees are merged as long as a block's eigenvalue
   count stays within the follow-up kernel's thread-block capacity
   (VIENNACL_BISECT_MAX_THREADS_BLOCK, halved for mat_size <= 512). */
-inline __device__
-void
-scanInitial(const unsigned int tid, const unsigned int tid_2, const unsigned int mat_size,
-            const unsigned int num_threads_active,
-            const unsigned int num_threads_compaction,
-            unsigned short *s_cl_one, unsigned short *s_cl_mult,
-            unsigned short *s_cl_blocking, unsigned short *s_cl_helper
-           )
-{
-
-  // perform scan to compactly write out the intervals containing one and
-  // multiple eigenvalues
-  // also generate tree for blocking of intervals containing multiple
-  // eigenvalues
-
-  unsigned int offset = 1;
-
-  // build scan tree
-  for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
-  {
-
-    __syncthreads();
-
-    if (tid < d)
-    {
-
-      unsigned int ai = offset*(2*tid+1);
-      unsigned int bi = offset*(2*tid+2)-1;
-
-      s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai - 1];
-      s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai - 1];
-
-      // s_cl_helper is binary and zero for an internal node and 1 for a
-      // root node of a tree corresponding to a block
-      // s_cl_blocking contains the number of nodes in each sub-tree at each
-      // iteration, the data has to be kept to compute the total number of
-      // eigenvalues per block that, in turn, is needed to efficiently
-      // write out data in the second step
-      if ((s_cl_helper[ai - 1] != 1) || (s_cl_helper[bi] != 1))
-      {
-
-        // check how many children are non terminated
-        if (s_cl_helper[ai - 1] == 1)
-        {
-          // mark as terminated
-          s_cl_helper[bi] = 1;
-        }
-        else if (s_cl_helper[bi] == 1)
-        {
-          // mark as terminated
-          s_cl_helper[ai - 1] = 1;
-        }
-        else // both children are non-terminated
-        {
-
-          unsigned int temp = s_cl_blocking[bi] + s_cl_blocking[ai - 1];
-
-          if (temp > (mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2))
-          {
-
-            // the two child trees have to form separate blocks, terminate trees
-            s_cl_helper[ai - 1] = 1;
-            s_cl_helper[bi] = 1;
-          }
-          else
-          {
-            // build up tree by joining subtrees
-            s_cl_blocking[bi] = temp;
-            s_cl_blocking[ai - 1] = 0;
-          }
-        }
-      } // end s_cl_helper update
-
-    }
-
-    offset <<= 1;
-  }
-
-
-  // traverse down tree, this only for stream compaction, not for block
-  // construction
-  for (int d = 2; d < num_threads_compaction; d <<= 1)
-  {
-
-    offset >>= 1;
-    __syncthreads();
-
-    //
-    if (tid < (d-1))
-    {
-
-      unsigned int ai = offset*(tid+1) - 1;
-      unsigned int bi = ai + (offset >> 1);
-
-      s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai];
-      s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai];
-    }
-  }
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Store all non-empty intervals resulting from the subdivision of the interval
-//! currently processed by the thread
-////////////////////////////////////////////////////////////////////////////////
/* An interval (left, right) split at mid yields up to two children; the
   child is empty when its eigenvalue counts coincide.  The first child is
   stored compactly at addr; a second child is only flagged via
   s_compaction_list / compact_second_chunk and compacted later. */
-template<typename NumericT>
-__device__
-void
-storeNonEmptyIntervalsLarge(unsigned int addr,
-                            const unsigned int num_threads_active,
-                            NumericT *s_left, NumericT *s_right,
-                            unsigned short *s_left_count,
-                            unsigned short *s_right_count,
-                            NumericT left, NumericT mid, NumericT right,
-                            const unsigned short left_count,
-                            const unsigned short mid_count,
-                            const unsigned short right_count,
-                            NumericT epsilon,
-                            unsigned int &compact_second_chunk,
-                            unsigned short *s_compaction_list,
-                            unsigned int &is_active_second)
-{
-  // check if both child intervals are valid
-  if ((left_count != mid_count) && (mid_count != right_count))
-  {
-
-    storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
-                  left, mid, left_count, mid_count, epsilon);
-
-    is_active_second = 1;
-    s_compaction_list[threadIdx.x] = 1;
-    compact_second_chunk = 1;
-  }
-  else
-  {
-
-    // only one non-empty child interval
-
-    // mark that no second child
-    is_active_second = 0;
-    s_compaction_list[threadIdx.x] = 0;
-
-    // store the one valid child interval
-    if (left_count != mid_count)
-    {
-      storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
-                    left, mid, left_count, mid_count, epsilon);
-    }
-    else
-    {
-      storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
-                    mid, right, mid_count, right_count, epsilon);
-    }
-  }
-}
-
-/** @brief Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix
-* g_d diagonal elements in global memory
-* g_s superdiagonal elements in global elements (stored so that the element *(g_s - 1) can be accessed and equals 0
-* n size of matrix
-* lg lower bound of input interval (e.g. Gerschgorin interval)
-* ug upper bound of input interval (e.g. Gerschgorin interval)
-* lg_eig_count number of eigenvalues that are smaller than lg
-* lu_eig_count number of eigenvalues that are smaller than lu
-* epsilon desired accuracy of eigenvalues to compute
-*/
/* Single-thread-block kernel.  Phase 1: repeatedly subdivide the interval
   tree (subdivideActiveInterval / computeNumSmallerEigenvalsLarge) with
   stream compaction of second children, until either all intervals
   converged or more intervals exist than the block has threads.  Phase 2:
   classify intervals into one-eigenvalue vs. multi-eigenvalue lists via
   parallel scans, group the multi ones into blocks, and write everything
   out with writeToGmem for the two follow-up kernels. */
-template<typename NumericT>
-__global__
-void
-bisectKernelLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
-                  const NumericT lg, const NumericT ug,
-                  const unsigned int lg_eig_count,
-                  const unsigned int ug_eig_count,
-                  NumericT epsilon,
-                  unsigned int *g_num_one,
-                  unsigned int *g_num_blocks_mult,
-                  NumericT *g_left_one, NumericT *g_right_one,
-                  unsigned int *g_pos_one,
-                  NumericT *g_left_mult, NumericT *g_right_mult,
-                  unsigned int *g_left_count_mult,
-                  unsigned int *g_right_count_mult,
-                  unsigned int *g_blocks_mult,
-                  unsigned int *g_blocks_mult_sum
-                 )
-{
-  const unsigned int tid = threadIdx.x;
-
-  // intervals (store left and right because the subdivision tree is in general
-  // not dense)
-  __shared__ NumericT s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
-  __shared__ NumericT s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
-
-  // number of eigenvalues that are smaller than s_left / s_right
-  // (correspondence is realized via indices)
-  __shared__ unsigned short s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
-  __shared__ unsigned short s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
-
-  // helper for stream compaction
-  __shared__ unsigned short s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
-
-  // state variables for whole block
-  // if 0 then compaction of second chunk of child intervals is not necessary
-  // (because all intervals had exactly one non-dead child)
-  __shared__ unsigned int compact_second_chunk;
-  // if 1 then all threads are converged
-  __shared__ unsigned int all_threads_converged;
-
-  // number of currently active threads
-  __shared__ unsigned int num_threads_active;
-
-  // number of threads to use for stream compaction
-  __shared__ unsigned int num_threads_compaction;
-
-  // helper for exclusive scan
-  unsigned short *s_compaction_list_exc = s_compaction_list + 1;
-
-
-  // variables for currently processed interval
-  // left and right limit of active interval
-  NumericT left = 0.0f;
-  NumericT right = 0.0f;
-  unsigned int left_count = 0;
-  unsigned int right_count = 0;
-  // midpoint of active interval
-  NumericT mid = 0.0f;
-  // number of eigenvalues smaller than mid
-  unsigned int mid_count = 0;
-  // helper for stream compaction (tracking of threads generating second child)
-  unsigned int is_active_second = 0;
-
-  // initialize lists
-  s_compaction_list[tid] = 0;
-  s_left[tid] = 0;
-  s_right[tid] = 0;
-  s_left_count[tid] = 0;
-  s_right_count[tid] = 0;
-
-  __syncthreads();
-
-  // set up initial configuration
-  if (0 == tid)
-  {
-
-    s_left[0] = lg;
-    s_right[0] = ug;
-    s_left_count[0] = lg_eig_count;
-    s_right_count[0] = ug_eig_count;
-
-    compact_second_chunk = 0;
-    num_threads_active = 1;
-
-    num_threads_compaction = 1;
-
-    all_threads_converged = 1;
-  }
-
-  __syncthreads();
-
-  // for all active threads read intervals from the last level
-  // the number of (worst case) active threads per level l is 2^l
-  // determine coarse intervals. On these intervals the kernel for one or for multiple eigenvalues
-  // will be executed in the second step
-  while(true)
-  {
-    s_compaction_list[tid] = 0;
-    s_compaction_list[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
-    s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
-    subdivideActiveInterval(tid, s_left, s_right, s_left_count, s_right_count,
-                            num_threads_active,
-                            left, right, left_count, right_count,
-                            mid, all_threads_converged);
-
-    __syncthreads();
-
-    // check if done
-    if (1 == all_threads_converged)
-    {
-      break;
-    }
-
-    // compute number of eigenvalues smaller than mid
-    // use all threads for reading the necessary matrix data from global
-    // memory
-    // use s_left and s_right as scratch space for diagonal and
-    // superdiagonal of matrix
-    mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
-                                                mid, threadIdx.x,
-                                                num_threads_active,
-                                                s_left, s_right,
-                                                (left == right));
-
-    __syncthreads();
-
-    // store intervals
-    // for all threads store the first child interval in a continuous chunk of
-    // memory, and the second child interval -- if it exists -- in a second
-    // chunk; it is likely that all threads reach convergence up to
-    // \a epsilon at the same level; furthermore, for higher level most / all
-    // threads will have only one child, storing the first child compactly will
-    // (first) avoid to perform a compaction step on the first chunk, (second)
-    // make it for higher levels (when all threads / intervals have
-    // exactly one child) unnecessary to perform a compaction of the second
-    // chunk
-    if (tid < num_threads_active)
-    {
-
-      if (left != right)
-      {
-
-        // store intervals
-        storeNonEmptyIntervalsLarge(tid, num_threads_active,
-                                    s_left, s_right,
-                                    s_left_count, s_right_count,
-                                    left, mid, right,
-                                    left_count, mid_count, right_count,
-                                    epsilon, compact_second_chunk,
-                                    s_compaction_list_exc,
-                                    is_active_second);
-      }
-      else
-      {
-
-        // re-write converged interval (has to be stored again because s_left
-        // and s_right are used as scratch space for
-        // computeNumSmallerEigenvalsLarge()
-        s_left[tid] = left;
-        s_right[tid] = left;
-        s_left_count[tid] = left_count;
-        s_right_count[tid] = right_count;
-
-        is_active_second = 0;
-      }
-    }
-
-    // necessary so that compact_second_chunk is up-to-date
-    __syncthreads();
-
-    // perform compaction of chunk where second children are stored
-    // scan of (num_threads_active / 2) elements, thus at most
-    // (num_threads_active / 4) threads are needed
-    if (compact_second_chunk > 0)
-    {
-
-      // create indices for compaction
-      createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);
-    }
-    __syncthreads();
-
-    if (compact_second_chunk > 0)
-    {
-      compactIntervals(s_left, s_right, s_left_count, s_right_count,
-                       mid, right, mid_count, right_count,
-                       s_compaction_list, num_threads_active,
-                       is_active_second);
-    }
-
-    __syncthreads();
-
-    // update state variables
-    if (0 == tid)
-    {
-
-      // update number of active threads with result of reduction
-      num_threads_active += s_compaction_list[num_threads_active];
-      num_threads_compaction = ceilPow2(num_threads_active);
-
-      compact_second_chunk = 0;
-      all_threads_converged = 1;
-    }
-
-    __syncthreads();
-
-    if (num_threads_compaction > blockDim.x)
-    {
-      break;
-    }
-
-  }
-
-  __syncthreads();
-
-  // generate two lists of intervals; one with intervals that contain one
-  // eigenvalue (or are converged), and one with intervals that need further
-  // subdivision
-
-  // perform two scans in parallel
-
-  unsigned int left_count_2;
-  unsigned int right_count_2;
-
-  unsigned int tid_2 = tid + blockDim.x;
-
-  // cache in per thread registers so that s_left_count and s_right_count
-  // can be used for scans
-  left_count = s_left_count[tid];
-  right_count = s_right_count[tid];
-
-  // some threads have to cache data for two intervals
-  if (tid_2 < num_threads_active)
-  {
-    left_count_2 = s_left_count[tid_2];
-    right_count_2 = s_right_count[tid_2];
-  }
-
-  // compaction list for intervals containing one and multiple eigenvalues
-  // do not affect first element for exclusive scan
-  unsigned short *s_cl_one = s_left_count + 1;
-  unsigned short *s_cl_mult = s_right_count + 1;
-
-  // compaction list for generating blocks of intervals containing multiple
-  // eigenvalues
-  unsigned short *s_cl_blocking = s_compaction_list_exc;
-  // helper compaction list for generating blocks of intervals
-  __shared__ unsigned short s_cl_helper[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
-
-  if (0 == tid)
-  {
-    // set to 0 for exclusive scan
-    s_left_count[0] = 0;
-    s_right_count[0] = 0;
-
-  }
-
-  __syncthreads();
-
-  // flag if interval contains one or multiple eigenvalues
-  unsigned int is_one_lambda = 0;
-  unsigned int is_one_lambda_2 = 0;
-
-  // number of eigenvalues in the interval
-  unsigned int multiplicity = right_count - left_count;
-  is_one_lambda = (1 == multiplicity);
-
-  s_cl_one[tid] = is_one_lambda;
-  s_cl_mult[tid] = (! is_one_lambda);
-
-  // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
-  s_cl_blocking[tid] = (1 == is_one_lambda) ? 0 : multiplicity;
-  s_cl_helper[tid] = 0;
-
-  if (tid_2 < num_threads_active)
-  {
-
-    unsigned int multiplicity = right_count_2 - left_count_2;
-    is_one_lambda_2 = (1 == multiplicity);
-
-    s_cl_one[tid_2] = is_one_lambda_2;
-    s_cl_mult[tid_2] = (! is_one_lambda_2);
-
-    // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
-    s_cl_blocking[tid_2] = (1 == is_one_lambda_2) ? 0 : multiplicity;
-    s_cl_helper[tid_2] = 0;
-  }
-  else if (tid_2 < (2 * (n > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2) + 1))
-  {
-
-    // clear
-    s_cl_blocking[tid_2] = 0;
-    s_cl_helper[tid_2] = 0;
-  }
-
-
-  scanInitial(tid, tid_2, n, num_threads_active, num_threads_compaction,
-              s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper);
-
-  __syncthreads();
-
-  scanSumBlocks(tid, tid_2, num_threads_active,
-                num_threads_compaction, s_cl_blocking, s_cl_helper);
-
-  // end down sweep of scan
-  __syncthreads();
-
-  unsigned int c_block_iend = 0;
-  unsigned int c_block_iend_2 = 0;
-  unsigned int c_sum_block = 0;
-  unsigned int c_sum_block_2 = 0;
-
-  // for each thread / interval that corresponds to root node of interval block
-  // store start address of block and total number of eigenvalues in all blocks
-  // before this block (particular thread is irrelevant, constraint is to
-  // have a subset of threads so that one and only one of them is in each
-  // interval)
-  if (1 == s_cl_helper[tid])
-  {
-
-    c_block_iend = s_cl_mult[tid] + 1;
-    c_sum_block = s_cl_blocking[tid];
-  }
-
-  if (1 == s_cl_helper[tid_2])
-  {
-
-    c_block_iend_2 = s_cl_mult[tid_2] + 1;
-    c_sum_block_2 = s_cl_blocking[tid_2];
-  }
-
-  scanCompactBlocksStartAddress(tid, tid_2, num_threads_compaction,
-                                s_cl_blocking, s_cl_helper);
-
-
-  // finished second scan for s_cl_blocking
-  __syncthreads();
-
-  // determine the global results
-  __shared__ unsigned int num_blocks_mult;
-  __shared__ unsigned int num_mult;
-  __shared__ unsigned int offset_mult_lambda;
-
-  if (0 == tid)
-  {
-
-    num_blocks_mult = s_cl_blocking[num_threads_active - 1];
-    offset_mult_lambda = s_cl_one[num_threads_active - 1];
-    num_mult = s_cl_mult[num_threads_active - 1];
-
-    *g_num_one = offset_mult_lambda;
-    *g_num_blocks_mult = num_blocks_mult;
-  }
-
-  __syncthreads();
-
-  NumericT left_2, right_2;
-  // shift pointers back by one so the exclusive-scan offsets line up
-  --s_cl_one;
-  --s_cl_mult;
-  --s_cl_blocking;
-
-  __syncthreads();
-  compactStreamsFinal(tid, tid_2, num_threads_active, offset_mult_lambda,
-                      s_left, s_right, s_left_count, s_right_count,
-                      s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper,
-                      is_one_lambda, is_one_lambda_2,
-                      left, right, left_2, right_2,
-                      left_count, right_count, left_count_2, right_count_2,
-                      c_block_iend, c_sum_block, c_block_iend_2, c_sum_block_2
-                     );
-
-  __syncthreads();
-
-  // final adjustment before writing out data to global memory
-  if (0 == tid)
-  {
-    s_cl_blocking[num_blocks_mult] = num_mult;
-    s_cl_helper[0] = 0;
-  }
-
-  __syncthreads();
-
-  // write to global memory
-  writeToGmem(tid, tid_2, num_threads_active, num_blocks_mult,
-              g_left_one, g_right_one, g_pos_one,
-              g_left_mult, g_right_mult, g_left_count_mult, g_right_count_mult,
-              s_left, s_right, s_left_count, s_right_count,
-              g_blocks_mult, g_blocks_mult_sum,
-              s_compaction_list, s_cl_helper, offset_mult_lambda);
-
-}
-}
-}
-}
-#endif // #ifndef _BISECT_KERNEL_LARGE_H_ http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp deleted file mode 100755 index a670256..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp +++ /dev/null @@ -1,277 +0,0 @@ -#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_ -#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_ - -/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
- - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - - -/** @file viennacl/linalg/cuda/bisect_kernel_large_multi.hpp - @brief Second step of the bisection algorithm for the computation of eigenvalues for large matrices. - - Implementation based on the sample provided with the CUDA 6.0 SDK, for which - the creation of derivative works is allowed by including the following statement: - "This software contains source code provided by NVIDIA Corporation." -*/ - -/* Perform second step of bisection algorithm for large matrices for - * intervals that contained after the first step more than one eigenvalue - */ - -// includes, project -#include "viennacl/linalg/detail/bisect/config.hpp" -#include "viennacl/linalg/detail/bisect/util.hpp" -// additional kernel -#include "viennacl/linalg/cuda/bisect_util.hpp" - -namespace viennacl -{ -namespace linalg -{ -namespace cuda -{ -//////////////////////////////////////////////////////////////////////////////// -//! Perform second step of bisection algorithm for large matrices for -//! intervals that after the first step contained more than one eigenvalue -//! @param g_d diagonal elements of symmetric, tridiagonal matrix -//! @param g_s superdiagonal elements of symmetric, tridiagonal matrix -//! @param n matrix size -//! @param blocks_mult start addresses of blocks of intervals that are -//! processed by one block of threads, each of the -//! intervals contains more than one eigenvalue -//! @param blocks_mult_sum total number of eigenvalues / singleton intervals -//! in one block of intervals -//! @param g_left left limits of intervals -//! @param g_right right limits of intervals -//! 
@param g_left_count number of eigenvalues less than left limits -//! @param g_right_count number of eigenvalues less than right limits -//! @param g_lambda final eigenvalue -//! @param g_pos index of eigenvalue (in ascending order) -//! @param precision desired precision of eigenvalues -//////////////////////////////////////////////////////////////////////////////// -template<typename NumericT> -__global__ -void -bisectKernelLarge_MultIntervals(const NumericT *g_d, const NumericT *g_s, const unsigned int n, - unsigned int *blocks_mult, - unsigned int *blocks_mult_sum, - NumericT *g_left, NumericT *g_right, - unsigned int *g_left_count, - unsigned int *g_right_count, - NumericT *g_lambda, unsigned int *g_pos, - NumericT precision - ) -{ - const unsigned int tid = threadIdx.x; - - // left and right limits of interval - __shared__ NumericT s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; - __shared__ NumericT s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; - - // number of eigenvalues smaller than interval limits - __shared__ unsigned int s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; - __shared__ unsigned int s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; - - // helper array for chunk compaction of second chunk - __shared__ unsigned int s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1]; - // compaction list helper for exclusive scan - unsigned int *s_compaction_list_exc = s_compaction_list + 1; - - // flag if all threads are converged - __shared__ unsigned int all_threads_converged; - // number of active threads - __shared__ unsigned int num_threads_active; - // number of threads to employ for compaction - __shared__ unsigned int num_threads_compaction; - // flag if second chunk has to be compacted - __shared__ unsigned int compact_second_chunk; - - // parameters of block of intervals processed by this block of threads - __shared__ unsigned int c_block_start; - __shared__ unsigned int c_block_end; - __shared__ unsigned int c_block_offset_output; 
- - // midpoint of currently active interval of the thread - NumericT mid = 0.0f; - // number of eigenvalues smaller than \a mid - unsigned int mid_count = 0; - // current interval parameter - NumericT left = 0.0f; - NumericT right = 0.0f; - unsigned int left_count = 0; - unsigned int right_count = 0; - // helper for compaction, keep track which threads have a second child - unsigned int is_active_second = 0; - - - __syncthreads(); - // initialize common start conditions - if (0 == tid) - { - - c_block_start = blocks_mult[blockIdx.x]; - c_block_end = blocks_mult[blockIdx.x + 1]; - c_block_offset_output = blocks_mult_sum[blockIdx.x]; - - - num_threads_active = c_block_end - c_block_start; - s_compaction_list[0] = 0; - num_threads_compaction = ceilPow2(num_threads_active); - - all_threads_converged = 1; - compact_second_chunk = 0; - } - - s_left_count [tid] = 42; - s_right_count[tid] = 42; - s_left_count [tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; - s_right_count[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; - - __syncthreads(); - - - // read data into shared memory - if (tid < num_threads_active) - { - s_left[tid] = g_left[c_block_start + tid]; - s_right[tid] = g_right[c_block_start + tid]; - s_left_count[tid] = g_left_count[c_block_start + tid]; - s_right_count[tid] = g_right_count[c_block_start + tid]; - } - - __syncthreads(); - unsigned int iter = 0; - // do until all threads converged - while (true) - { - iter++; - //for (int iter=0; iter < 0; iter++) { - s_compaction_list[threadIdx.x] = 0; - s_compaction_list[threadIdx.x + blockDim.x] = 0; - s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; - - // subdivide interval if currently active and not already converged - subdivideActiveIntervalMulti(tid, s_left, s_right, - s_left_count, s_right_count, - num_threads_active, - left, right, left_count, right_count, - mid, all_threads_converged); - __syncthreads(); - - // stop if all eigenvalues have been found - if (1 == all_threads_converged) - { - - 
break; - } - - // compute number of eigenvalues smaller than mid for active and not - // converged intervals, use all threads for loading data from gmem and - // s_left and s_right as scratch space to store the data load from gmem - // in shared memory - mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, - mid, tid, num_threads_active, - s_left, s_right, - (left == right)); - - __syncthreads(); - - if (tid < num_threads_active) - { - - // store intervals - if (left != right) - { - - storeNonEmptyIntervals(tid, num_threads_active, - s_left, s_right, s_left_count, s_right_count, - left, mid, right, - left_count, mid_count, right_count, - precision, compact_second_chunk, - s_compaction_list_exc, - is_active_second); - - } - else - { - - storeIntervalConverged(s_left, s_right, s_left_count, s_right_count, - left, mid, right, - left_count, mid_count, right_count, - s_compaction_list_exc, compact_second_chunk, - num_threads_active, - is_active_second); - - } - } - - __syncthreads(); - - // compact second chunk of intervals if any of the threads generated - // two child intervals - if (1 == compact_second_chunk) - { - - createIndicesCompaction(s_compaction_list_exc, num_threads_compaction); - compactIntervals(s_left, s_right, s_left_count, s_right_count, - mid, right, mid_count, right_count, - s_compaction_list, num_threads_active, - is_active_second); - } - - __syncthreads(); - - // update state variables - if (0 == tid) - { - num_threads_active += s_compaction_list[num_threads_active]; - num_threads_compaction = ceilPow2(num_threads_active); - - compact_second_chunk = 0; - all_threads_converged = 1; - } - - __syncthreads(); - - // clear - s_compaction_list_exc[threadIdx.x] = 0; - s_compaction_list_exc[threadIdx.x + blockDim.x] = 0; - - if (num_threads_compaction > blockDim.x) - { - break; - } - - - __syncthreads(); - - } // end until all threads converged - - // write data back to global memory - if (tid < num_threads_active) - { - - unsigned int addr = 
c_block_offset_output + tid; - - g_lambda[addr] = s_left[tid]; - g_pos[addr] = s_right_count[tid]; - } -} -} // namespace cuda -} // namespace linalg -} // namespace viennacl - -#endif // #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_
