http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp deleted file mode 100755 index 960f5c2..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp +++ /dev/null @@ -1,180 +0,0 @@ -#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_ONEI_HPP_ -#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_ONEI_HPP_ - -/* ========================================================================= - Copyright (c) 2010-2016, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. - Portions of this software are copyright by UChicago Argonne, LLC. - - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - - -/** @file viennacl/linalg/cuda/bisect_kernel_large_onei.hpp - @brief Determine eigenvalues for large matrices for intervals that contained after the first step one eigenvalue - - Implementation based on the sample provided with the CUDA 6.0 SDK, for which - the creation of derivative works is allowed by including the following statement: - "This software contains source code provided by NVIDIA Corporation." -*/ - -// includes, project -#include "viennacl/linalg/detail/bisect/config.hpp" -#include "viennacl/linalg/detail/bisect/util.hpp" -// additional kernel -#include "viennacl/linalg/cuda/bisect_util.hpp" - -namespace viennacl -{ -namespace linalg -{ -namespace cuda -{ -//////////////////////////////////////////////////////////////////////////////// -//! Determine eigenvalues for large matrices for intervals that after -//! the first step contained one eigenvalue -//! @param g_d diagonal elements of symmetric, tridiagonal matrix -//! @param g_s superdiagonal elements of symmetric, tridiagonal matrix -//! @param n matrix size -//! @param num_intervals total number of intervals containing one eigenvalue -//! after the first step -//! @param g_left left interval limits -//! @param g_right right interval limits -//! @param g_pos index of interval / number of intervals that are smaller than -//! right interval limit -//! @param precision desired precision of eigenvalues -//////////////////////////////////////////////////////////////////////////////// -template<typename NumericT> -__global__ -void -bisectKernelLarge_OneIntervals(const NumericT *g_d, const NumericT *g_s, const unsigned int n, - unsigned int num_intervals, - NumericT *g_left, NumericT *g_right, - unsigned int *g_pos, - NumericT precision) -{ - - const unsigned int gtid = (blockDim.x * blockIdx.x) + threadIdx.x; - - __shared__ NumericT s_left_scratch[VIENNACL_BISECT_MAX_THREADS_BLOCK]; - __shared__ NumericT s_right_scratch[VIENNACL_BISECT_MAX_THREADS_BLOCK]; - - // active interval of thread - // left and right limit of current interval - NumericT left, right; - // number of threads smaller than the right limit (also corresponds to the - // global index of the eigenvalues contained in the active interval) - unsigned int right_count; - // flag if current thread converged - unsigned int converged = 0; - // midpoint when current interval is subdivided - NumericT mid = 0.0f; - // number of eigenvalues less than mid - unsigned int mid_count = 0; - - // read data from global memory - if (gtid < num_intervals) - { - left = g_left[gtid]; - right = g_right[gtid]; - right_count = g_pos[gtid]; - } - - - // flag to determine if all threads converged to eigenvalue - __shared__ unsigned int converged_all_threads; - - // initialized shared flag - if (0 == threadIdx.x) - { - converged_all_threads = 0; - } - - __syncthreads(); - - // process until all threads converged to an eigenvalue - while (true) - { - - converged_all_threads = 1; - - // update midpoint for all active threads - if ((gtid < num_intervals) && (0 == converged)) - { - mid = computeMidpoint(left, right); - } - - // find number of eigenvalues that are smaller than midpoint - mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, - mid, gtid, num_intervals, - s_left_scratch, - s_right_scratch, - converged); - - __syncthreads(); - - // for all active threads - if ((gtid < num_intervals) && (0 == converged)) - { - - // update intervals -- always one child interval survives - if (right_count == mid_count) - { - right = mid; - } - else - { - left = mid; - } - - // check for convergence - NumericT t0 = right - left; - NumericT t1 = max(abs(right), abs(left)) * precision; - - if (t0 < min(precision, t1)) - { - NumericT lambda = computeMidpoint(left, right); - left = lambda; - right = lambda; - - converged = 1; - } - else - { - converged_all_threads = 0; - } - } - - __syncthreads(); - - if (1 == converged_all_threads) - { - break; - } - - __syncthreads(); - } - - // write data back to global memory - __syncthreads(); - - if (gtid < num_intervals) - { - // intervals converged so left and right interval limit are both identical - // and identical to the eigenvalue - g_left[gtid] = left; - } -} -} // namespace cuda -} // namespace linalg -} // namespace viennacl -#endif // #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_ONEI_HPP_
http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp deleted file mode 100755 index 310b381..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp +++ /dev/null @@ -1,261 +0,0 @@ -#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_SMALL_HPP_ -#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_SMALL_HPP_ - -/* ========================================================================= - Copyright (c) 2010-2016, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. - Portions of this software are copyright by UChicago Argonne, LLC. - - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - - -/** @file viennacl/linalg/cuda/bisect_kernel_small.hpp - @brief Determine eigenvalues for small symmetric, tridiagonal matrix - - Implementation based on the sample provided with the CUDA 6.0 SDK, for which - the creation of derivative works is allowed by including the following statement: - "This software contains source code provided by NVIDIA Corporation." -*/ - -// includes, project -#include "viennacl/linalg/detail/bisect/config.hpp" -#include "viennacl/linalg/detail/bisect/util.hpp" -// additional kernel -#include "viennacl/linalg/cuda/bisect_util.hpp" - -namespace viennacl -{ -namespace linalg -{ -namespace cuda -{ - -/** @brief Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix -* -* @param g_d diagonal elements in global memory -* @param g_s superdiagonal elements in global elements (stored so that the element *(g_s - 1) can be accessed an equals 0 -* @param n size of matrix -* @param g_left helper array -* @param g_right helper array -* @param g_left_count helper array -* @param g_right_count helper array -* @param lg lower bound of input interval (e.g. Gerschgorin interval) -* @param ug upper bound of input interval (e.g. Gerschgorin interval) -* @param lg_eig_count number of eigenvalues that are smaller than lg -* @param ug_eig_count number of eigenvalues that are smaller than lu -* @param epsilon desired accuracy of eigenvalues to compute -*/ -template<typename NumericT> -__global__ -void -bisectKernelSmall(const NumericT *g_d, const NumericT *g_s, const unsigned int n, - NumericT * g_left, NumericT *g_right, - unsigned int *g_left_count, unsigned int *g_right_count, - const NumericT lg, const NumericT ug, - const unsigned int lg_eig_count, const unsigned int ug_eig_count, - NumericT epsilon - ) -{ - // intervals (store left and right because the subdivision tree is in general - // not dense - __shared__ NumericT s_left[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; - __shared__ NumericT s_right[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; - - // number of eigenvalues that are smaller than s_left / s_right - // (correspondence is realized via indices) - __shared__ unsigned int s_left_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; - __shared__ unsigned int s_right_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; - - // helper for stream compaction - __shared__ unsigned int - s_compaction_list[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX + 1]; - - // state variables for whole block - // if 0 then compaction of second chunk of child intervals is not necessary - // (because all intervals had exactly one non-dead child) - __shared__ unsigned int compact_second_chunk; - __shared__ unsigned int all_threads_converged; - - // number of currently active threads - __shared__ unsigned int num_threads_active; - - // number of threads to use for stream compaction - __shared__ unsigned int num_threads_compaction; - - // helper for exclusive scan - unsigned int *s_compaction_list_exc = s_compaction_list + 1; - - - // variables for currently processed interval - // left and right limit of active interval - NumericT left = 0.0f; - NumericT right = 0.0f; - unsigned int left_count = 0; - unsigned int right_count = 0; - // midpoint of active interval - NumericT mid = 0.0f; - // number of eigenvalues smaller then mid - unsigned int mid_count = 0; - // affected from compaction - unsigned int is_active_second = 0; - - s_compaction_list[threadIdx.x] = 0; - s_left[threadIdx.x] = 0; - s_right[threadIdx.x] = 0; - s_left_count[threadIdx.x] = 0; - s_right_count[threadIdx.x] = 0; - - __syncthreads(); - - // set up initial configuration - if (0 == threadIdx.x) - { - s_left[0] = lg; - s_right[0] = ug; - s_left_count[0] = lg_eig_count; - s_right_count[0] = ug_eig_count; - - compact_second_chunk = 0; - num_threads_active = 1; - - num_threads_compaction = 1; - } - - // for all active threads read intervals from the last level - // the number of (worst case) active threads per level l is 2^l - while (true) - { - - all_threads_converged = 1; - __syncthreads(); - - is_active_second = 0; - subdivideActiveIntervalMulti(threadIdx.x, - s_left, s_right, s_left_count, s_right_count, - num_threads_active, - left, right, left_count, right_count, - mid, all_threads_converged); - - __syncthreads(); - - // check if done - if (1 == all_threads_converged) - { - break; - } - - __syncthreads(); - - // compute number of eigenvalues smaller than mid - // use all threads for reading the necessary matrix data from global - // memory - // use s_left and s_right as scratch space for diagonal and - // superdiagonal of matrix - mid_count = computeNumSmallerEigenvals(g_d, g_s, n, mid, - threadIdx.x, num_threads_active, - s_left, s_right, - (left == right)); - - __syncthreads(); - - // store intervals - // for all threads store the first child interval in a continuous chunk of - // memory, and the second child interval -- if it exists -- in a second - // chunk; it is likely that all threads reach convergence up to - // \a epsilon at the same level; furthermore, for higher level most / all - // threads will have only one child, storing the first child compactly will - // (first) avoid to perform a compaction step on the first chunk, (second) - // make it for higher levels (when all threads / intervals have - // exactly one child) unnecessary to perform a compaction of the second - // chunk - if (threadIdx.x < num_threads_active) - { - - if (left != right) - { - - // store intervals - storeNonEmptyIntervals(threadIdx.x, num_threads_active, - s_left, s_right, s_left_count, s_right_count, - left, mid, right, - left_count, mid_count, right_count, - epsilon, compact_second_chunk, - s_compaction_list_exc, - is_active_second); - } - else - { - - storeIntervalConverged(s_left, s_right, s_left_count, s_right_count, - left, mid, right, - left_count, mid_count, right_count, - s_compaction_list_exc, compact_second_chunk, - num_threads_active, - is_active_second); - } - } - - // necessary so that compact_second_chunk is up-to-date - __syncthreads(); - - // perform compaction of chunk where second children are stored - // scan of (num_threads_active / 2) elements, thus at most - // (num_threads_active / 4) threads are needed - if (compact_second_chunk > 0) - { - - createIndicesCompaction(s_compaction_list_exc, num_threads_compaction); - - compactIntervals(s_left, s_right, s_left_count, s_right_count, - mid, right, mid_count, right_count, - s_compaction_list, num_threads_active, - is_active_second); - } - - __syncthreads(); - - if (0 == threadIdx.x) - { - - // update number of active threads with result of reduction - num_threads_active += s_compaction_list[num_threads_active]; - - num_threads_compaction = ceilPow2(num_threads_active); - - compact_second_chunk = 0; - } - - __syncthreads(); - - } - - __syncthreads(); - - // write resulting intervals to global mem - // for all threads write if they have been converged to an eigenvalue to - // a separate array - - // at most n valid intervals - if (threadIdx.x < n) - { - - // intervals converged so left and right limit are identical - g_left[threadIdx.x] = s_left[threadIdx.x]; - // left count is sufficient to have global order - g_left_count[threadIdx.x] = s_left_count[threadIdx.x]; - } -} -} // namespace cuda -} // namespace linalg -} // namespace viennacl -#endif // #ifndef _BISECT_KERNEL_SMALL_H_ http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp deleted file mode 100755 index e2e262c..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp +++ /dev/null @@ -1,613 +0,0 @@ -#ifndef VIENNACL_LINALG_CUDA_BISECT_BISECT_UTIL_HPP_ -#define VIENNACL_LINALG_CUDA_BISECT_BISECT_UTIL_HPP_ - -/* ========================================================================= - Copyright (c) 2010-2016, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. - Portions of this software are copyright by UChicago Argonne, LLC. - - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - - -/** @file viennacl/linalg/cuda/bisect_kernel_small.hpp - @brief Utility / shared functionality for bisection kernels - - Implementation based on the sample provided with the CUDA 6.0 SDK, for which - the creation of derivative works is allowed by including the following statement: - "This software contains source code provided by NVIDIA Corporation." -*/ - -// includes, project -#include "viennacl/linalg/detail/bisect/config.hpp" -#include "viennacl/linalg/detail/bisect/util.hpp" - -namespace viennacl -{ -namespace linalg -{ -namespace cuda -{ -//////////////////////////////////////////////////////////////////////////////// -//! Compute the next lower power of two of n -//! @param n number for which next higher power of two is seeked -//////////////////////////////////////////////////////////////////////////////// -__device__ -inline int -floorPow2(int n) -{ - - // early out if already power of two - if (0 == (n & (n-1))) - { - return n; - } - - int exp; - frexp((float)n, &exp); - return (1 << (exp - 1)); -} - -//////////////////////////////////////////////////////////////////////////////// -//! Compute the next higher power of two of n -//! @param n number for which next higher power of two is seeked -//////////////////////////////////////////////////////////////////////////////// -__device__ -inline int -ceilPow2(int n) -{ - - // early out if already power of two - if (0 == (n & (n-1))) - { - return n; - } - - int exp; - frexp((float)n, &exp); - return (1 << exp); -} - -//////////////////////////////////////////////////////////////////////////////// -//! Compute midpoint of interval [\a left, \a right] avoiding overflow if -//! possible -//! @param left left / lower limit of interval -//! @param right right / upper limit of interval -//////////////////////////////////////////////////////////////////////////////// -template<typename NumericT> -__device__ -inline NumericT -computeMidpoint(const NumericT left, const NumericT right) -{ - - NumericT mid; - - if (viennacl::linalg::detail::sign_f(left) == viennacl::linalg::detail::sign_f(right)) - { - mid = left + (right - left) * 0.5f; - } - else - { - mid = (left + right) * 0.5f; - } - - return mid; -} - -//////////////////////////////////////////////////////////////////////////////// -//! Check if interval converged and store appropriately -//! @param addr address where to store the information of the interval -//! @param s_left shared memory storage for left interval limits -//! @param s_right shared memory storage for right interval limits -//! @param s_left_count shared memory storage for number of eigenvalues less -//! than left interval limits -//! @param s_right_count shared memory storage for number of eigenvalues less -//! than right interval limits -//! @param left lower limit of interval -//! @param right upper limit of interval -//! @param left_count eigenvalues less than \a left -//! @param right_count eigenvalues less than \a right -//! @param precision desired precision for eigenvalues -//////////////////////////////////////////////////////////////////////////////// -template<class S, class T, class NumericT> -__device__ -void -storeInterval(unsigned int addr, - NumericT *s_left, NumericT *s_right, - T *s_left_count, T *s_right_count, - NumericT left, NumericT right, - S left_count, S right_count, - NumericT precision) -{ - s_left_count[addr] = left_count; - s_right_count[addr] = right_count; - - // check if interval converged - NumericT t0 = abs(right - left); - NumericT t1 = max(abs(left), abs(right)) * precision; - - if (t0 <= max(static_cast<NumericT>(VIENNACL_BISECT_MIN_ABS_INTERVAL), t1)) - { - // compute mid point - NumericT lambda = computeMidpoint(left, right); - - // mark as converged - s_left[addr] = lambda; - s_right[addr] = lambda; - } - else - { - - // store current limits - s_left[addr] = left; - s_right[addr] = right; - } -} - -//////////////////////////////////////////////////////////////////////////////// -//! Compute number of eigenvalues that are smaller than x given a symmetric, -//! real, and tridiagonal matrix -//! @param g_d diagonal elements stored in global memory -//! @param g_s superdiagonal elements stored in global memory -//! @param n size of matrix -//! @param x value for which the number of eigenvalues that are smaller is -//! seeked -//! @param tid thread identified (e.g. threadIdx.x or gtid) -//! @param num_intervals_active number of active intervals / threads that -//! currently process an interval -//! @param s_d scratch space to store diagonal entries of the tridiagonal -//! matrix in shared memory -//! @param s_s scratch space to store superdiagonal entries of the tridiagonal -//! matrix in shared memory -//! @param converged flag if the current thread is already converged (that -//! is count does not have to be computed) -//////////////////////////////////////////////////////////////////////////////// -template<typename NumericT> -__device__ -inline unsigned int -computeNumSmallerEigenvals(const NumericT *g_d, const NumericT *g_s, const unsigned int n, - const NumericT x, - const unsigned int tid, - const unsigned int num_intervals_active, - NumericT *s_d, NumericT *s_s, - unsigned int converged - ) -{ - - NumericT delta = 1.0f; - unsigned int count = 0; - - __syncthreads(); - - // read data into shared memory - if (threadIdx.x < n) - { - s_d[threadIdx.x] = *(g_d + threadIdx.x); - s_s[threadIdx.x] = *(g_s + threadIdx.x - 1); - } - - __syncthreads(); - - // perform loop only for active threads - if ((tid < num_intervals_active) && (0 == converged)) - { - - // perform (optimized) Gaussian elimination to determine the number - // of eigenvalues that are smaller than n - for (unsigned int k = 0; k < n; ++k) - { - delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; - count += (delta < 0) ? 1 : 0; - } - - } // end if thread currently processing an interval - - return count; -} -//////////////////////////////////////////////////////////////////////////////// -//! Compute number of eigenvalues that are smaller than x given a symmetric, -//! real, and tridiagonal matrix -//! @param g_d diagonal elements stored in global memory -//! @param g_s superdiagonal elements stored in global memory -//! @param n size of matrix -//! @param x value for which the number of eigenvalues that are smaller is -//! seeked -//! @param tid thread identified (e.g. threadIdx.x or gtid) -//! @param num_intervals_active number of active intervals / threads that -//! currently process an interval -//! @param s_d scratch space to store diagonal entries of the tridiagonal -//! matrix in shared memory -//! @param s_s scratch space to store superdiagonal entries of the tridiagonal -//! matrix in shared memory -//! @param converged flag if the current thread is already converged (that -//! is count does not have to be computed) -//////////////////////////////////////////////////////////////////////////////// -template<typename NumericT> -__device__ -inline unsigned int -computeNumSmallerEigenvalsLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int n, - const NumericT x, - const unsigned int tid, - const unsigned int num_intervals_active, - NumericT *s_d, NumericT *s_s, - unsigned int converged - ) -{ - NumericT delta = 1.0f; - unsigned int count = 0; - - unsigned int rem = n; - - // do until whole diagonal and superdiagonal has been loaded and processed - for (unsigned int i = 0; i < n; i += blockDim.x) - { - - __syncthreads(); - - // read new chunk of data into shared memory - if ((i + threadIdx.x) < n) - { - - s_d[threadIdx.x] = *(g_d + i + threadIdx.x); - s_s[threadIdx.x] = *(g_s + i + threadIdx.x - 1); - } - - __syncthreads(); - - - if (tid < num_intervals_active) - { - - // perform (optimized) Gaussian elimination to determine the number - // of eigenvalues that are smaller than n - for (unsigned int k = 0; k < min(rem,blockDim.x); ++k) - { - delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; - // delta = (abs( delta) < (1.0e-10)) ? -(1.0e-10) : delta; - count += (delta < 0) ? 1 : 0; - } - - } // end if thread currently processing an interval - - rem -= blockDim.x; - } - - return count; -} - -/** @brief Store all non-empty intervals resulting from the subdivision of the interval currently processed by the thread. -* -* @param addr base address for storing intervals -* @param num_threads_active number of threads / intervals in current sweep -* @param s_left shared memory storage for left interval limits -* @param s_right shared memory storage for right interval limits -* @param s_left_count shared memory storage for number of eigenvalues less than left interval limits -* @param s_right_count shared memory storage for number of eigenvalues less than right interval limits -* @param left lower limit of interval -* @param mid midpoint of interval -* @param right upper limit of interval -* @param left_count eigenvalues less than \a left -* @param mid_count eigenvalues less than \a mid -* @param right_count eigenvalues less than \a right -* @param precision desired precision for eigenvalues -* @param compact_second_chunk shared mem flag if second chunk is used and ergo requires compaction -* @param s_compaction_list_exc helper array for stream compaction, s_compaction_list_exc[tid] = 1 when the thread generated two child intervals -* @param is_active_second mark is thread has a second non-empty child interval -*/ -template<class S, class T, class NumericT> -__device__ -void -storeNonEmptyIntervals(unsigned int addr, - const unsigned int num_threads_active, - NumericT *s_left, NumericT *s_right, - T *s_left_count, T *s_right_count, - NumericT left, NumericT mid, NumericT right, - const S left_count, - const S mid_count, - const S right_count, - NumericT precision, - unsigned int &compact_second_chunk, - T *s_compaction_list_exc, - unsigned int &is_active_second) -{ - // check if both child intervals are valid - - if ((left_count != mid_count) && (mid_count != right_count)) - { - - // store the left interval - storeInterval(addr, s_left, s_right, s_left_count, s_right_count, - left, mid, left_count, mid_count, precision); - - // mark that a second interval has been generated, only stored after - // stream compaction of second chunk - is_active_second = 1; - s_compaction_list_exc[threadIdx.x] = 1; - compact_second_chunk = 1; - } - else - { - - // only one non-empty child interval - - // mark that no second child - is_active_second = 0; - s_compaction_list_exc[threadIdx.x] = 0; - - // store the one valid child interval - if (left_count != mid_count) - { - storeInterval(addr, s_left, s_right, s_left_count, s_right_count, - left, mid, left_count, mid_count, precision); - } - else - { - storeInterval(addr, s_left, s_right, s_left_count, s_right_count, - mid, right, mid_count, right_count, precision); - } - - } -} -//////////////////////////////////////////////////////////////////////////////// -//! Create indices for compaction, that is process \a s_compaction_list_exc -//! which is 1 for intervals that generated a second child and 0 otherwise -//! and create for each of the non-zero elements the index where the new -//! interval belongs to in a compact representation of all generated second -//! childs -//! @param s_compaction_list_exc list containing the flags which threads -//! generated two childs -//! @param num_threads_compaction number of threads to employ for compaction -//////////////////////////////////////////////////////////////////////////////// -template<class T> -__device__ -void -createIndicesCompaction(T *s_compaction_list_exc, - unsigned int num_threads_compaction) -{ - - unsigned int offset = 1; - const unsigned int tid = threadIdx.x; - // if(tid == 0) - // printf("num_threads_compaction = %u\n", num_threads_compaction); - - // higher levels of scan tree - for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) - { - - __syncthreads(); - - if (tid < d) - { - - unsigned int ai = offset*(2*tid+1)-1; - unsigned int bi = offset*(2*tid+2)-1; - - s_compaction_list_exc[bi] = s_compaction_list_exc[bi] - + s_compaction_list_exc[ai]; - } - - offset <<= 1; - } - - // traverse down tree: first down to level 2 across - for (int d = 2; d < num_threads_compaction; d <<= 1) - { - - offset >>= 1; - __syncthreads(); - - if (tid < (d-1)) - { - - unsigned int ai = offset*(tid+1) - 1; - unsigned int bi = ai + (offset >> 1); - - s_compaction_list_exc[bi] = s_compaction_list_exc[bi] - + s_compaction_list_exc[ai]; - } - } - - __syncthreads(); - -} - -/** @brief Perform stream compaction for second child intervals -* -* @param s_left shared memory storage for left interval limits -* @param s_right shared memory storage for right interval limits -* @param s_left_count shared memory storage for number of eigenvalues less than left interval limits -* @param s_right_count shared memory storage for number of eigenvalues less than right interval limits -* @param mid midpoint of current interval (left of new interval) -* @param right upper limit of interval -* @param mid_count eigenvalues less than mid -* @param right_count eigenvalues less than right -* @param s_compaction_list list containing the indices where the data has to be stored -* @param num_threads_active number of active threads / intervals -* @param is_active_second mark is thread has a second non-empty child interval -*/ -template<class T, class NumericT> -__device__ -void -compactIntervals(NumericT *s_left, NumericT *s_right, - T *s_left_count, T *s_right_count, - NumericT mid, NumericT right, - unsigned int mid_count, unsigned int right_count, - T *s_compaction_list, - unsigned int num_threads_active, - unsigned int is_active_second) -{ - const unsigned int tid = threadIdx.x; - - // perform compaction / copy data for all threads where the second - // child is not dead - if ((tid < num_threads_active) && (1 == is_active_second)) - { - unsigned int addr_w = num_threads_active + s_compaction_list[tid]; - s_left[addr_w] = mid; - s_right[addr_w] = right; - s_left_count[addr_w] = mid_count; - s_right_count[addr_w] = right_count; - } -} - -template<class T, class S, class NumericT> -__device__ -void -storeIntervalConverged(NumericT *s_left, NumericT *s_right, - T *s_left_count, T *s_right_count, - NumericT &left, NumericT &mid, NumericT &right, - S &left_count, S &mid_count, S &right_count, - T *s_compaction_list_exc, - unsigned int &compact_second_chunk, - const unsigned int num_threads_active, - unsigned int &is_active_second) -{ - const unsigned int tid = threadIdx.x; - const unsigned int multiplicity = right_count - left_count; - // check multiplicity of eigenvalue - if (1 == multiplicity) - { - - // just re-store intervals, simple eigenvalue - s_left[tid] = left; - s_right[tid] = right; - s_left_count[tid] = left_count; - s_right_count[tid] = right_count; - - - // mark that no second child / clear - is_active_second = 0; - s_compaction_list_exc[tid] = 0; - } - else - { - - // number of eigenvalues after the split less than mid - mid_count = left_count + (multiplicity >> 1); - - // store left interval - s_left[tid] = left; - s_right[tid] = right; - s_left_count[tid] = left_count; - s_right_count[tid] = mid_count; - mid = left; - - // mark that second child interval exists - is_active_second = 1; - s_compaction_list_exc[tid] = 1; - compact_second_chunk = 1; - } -} - -/** @brief Subdivide interval if active and not already converged. -* -* @param tid id of thread -* @param s_left shared memory storage for left interval limits -* @param s_right shared memory storage for right interval limits -* @param s_left_count shared memory storage for number of eigenvalues less than left interval limits -* @param s_right_count shared memory storage for number of eigenvalues less than right interval limits -* @param num_threads_active number of active threads in warp -* @param left lower limit of interval -* @param right upper limit of interval -* @param left_count eigenvalues less than \a left -* @param right_count eigenvalues less than \a right -* @param mid median of interval -* @param all_threads_converged shared memory flag if all threads are -*/ -template<class T, class NumericT> -__device__ -void -subdivideActiveIntervalMulti(const unsigned int tid, - NumericT *s_left, NumericT *s_right, - T *s_left_count, T *s_right_count, - const unsigned int num_threads_active, - NumericT &left, NumericT &right, - unsigned int &left_count, unsigned int &right_count, - NumericT &mid, unsigned int &all_threads_converged) -{ - // for all active threads - if (tid < num_threads_active) - { - - left = s_left[tid]; - right = s_right[tid]; - left_count = s_left_count[tid]; - right_count = s_right_count[tid]; - - // check if thread already converged - if (left != right) - { - - mid = computeMidpoint(left, right); - all_threads_converged = 0; - } - else if ((right_count - left_count) > 1) - { - // mark as not converged if multiple eigenvalues enclosed - // duplicate interval in storeIntervalsConverged() - all_threads_converged = 0; - } - - } // end for all active threads -} - - -/** @brief Subdivide interval if active and not already converged. -* -* @param tid id of thread -* @param s_left shared memory storage for left interval limits -* @param s_right shared memory storage for right interval limits -* @param s_left_count shared memory storage for number of eigenvalues less than left interval limits -* @param s_right_count shared memory storage for number of eigenvalues less than right interval limits -* @param num_threads_active number of active threads in warp -* @param left lower limit of interval -* @param right upper limit of interval -* @param left_count eigenvalues less than \a left -* @param right_count eigenvalues less than \a right -* @param mid median of interval -* @param all_threads_converged shared memory flag if all threads are -*/ -template<class T, class NumericT> -__device__ -void -subdivideActiveInterval(const unsigned int tid, - NumericT *s_left, NumericT *s_right, - T *s_left_count, T *s_right_count, - const unsigned int num_threads_active, - NumericT &left, NumericT &right, - unsigned int &left_count, unsigned int &right_count, - NumericT &mid, unsigned int &all_threads_converged) -{ - // for all active threads - if (tid < num_threads_active) - { - - left = s_left[tid]; - right = s_right[tid]; - left_count = s_left_count[tid]; - right_count = s_right_count[tid]; - - // check if thread already converged - if (left != right) - { - - mid = computeMidpoint(left, right); - all_threads_converged = 0; - } - } // end for all active threads -} -} -} -} - -#endif // #ifndef VIENNACL_LINALG_DETAIL_BISECT_UTIL_HPP_ - http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp deleted file mode 100644 index 3622b89..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp +++ /dev/null @@ -1,250 +0,0 @@ -#ifndef VIENNACL_LINALG_CUDA_COMMON_HPP_ -#define VIENNACL_LINALG_CUDA_COMMON_HPP_ - -/* ========================================================================= - Copyright (c) 2010-2016, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. - Portions of this software are copyright by UChicago Argonne, LLC. - - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - -/** @file viennacl/linalg/cuda/common.hpp - @brief Common routines for CUDA execution -*/ - -#include <sstream> -#include <cuda_runtime.h> -#include "viennacl/backend/cuda.hpp" -#include "viennacl/traits/handle.hpp" - -#define VIENNACL_CUDA_LAST_ERROR_CHECK(message) detail::cuda_last_error_check (message, __FILE__, __LINE__) - -namespace viennacl -{ - -////// scalar - -/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Non-const version. */ -template<typename NumericT> -NumericT * cuda_arg(scalar<NumericT> & obj) -{ - return reinterpret_cast<NumericT *>(viennacl::traits::handle(obj).cuda_handle().get()); -} - -/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Const version. */ -template<typename NumericT> -const NumericT * cuda_arg(scalar<NumericT> const & obj) -{ - return reinterpret_cast<const NumericT *>(viennacl::traits::handle(obj).cuda_handle().get()); -} - - -////// vector_base - -/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL vector (through the base class vector_base) with implicit return type deduction. Non-const version. */ -template<typename NumericT> -NumericT * cuda_arg(vector_base<NumericT> & obj) -{ - return reinterpret_cast<NumericT *>(viennacl::traits::handle(obj).cuda_handle().get()); -} - -/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL vector (through the base class vector_base) with implicit return type deduction. Const version. */ -template<typename NumericT> -const NumericT * cuda_arg(vector_base<NumericT> const & obj) -{ - return reinterpret_cast<const NumericT *>(viennacl::traits::handle(obj).cuda_handle().get()); -} - -/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL vector (through the base class vector_base). Return type needs to be explicitly provided as first template argument. Non-const version. */ -template<typename ReturnT, typename NumericT> -ReturnT * cuda_arg(vector_base<NumericT> & obj) -{ - return reinterpret_cast<ReturnT *>(viennacl::traits::handle(obj).cuda_handle().get()); -} - -/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL vector (through the base class vector_base). Return type needs to be explicitly provided as first template argument. Const version. */ -template<typename ReturnT, typename NumericT> -const ReturnT * cuda_arg(vector_base<NumericT> const & obj) -{ - return reinterpret_cast<const ReturnT *>(viennacl::traits::handle(obj).cuda_handle().get()); -} - - -////// matrix_base - -/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL matrix (through the base class matrix_base). Non-const version. */ -template<typename NumericT> -NumericT * cuda_arg(matrix_base<NumericT> & obj) -{ - return reinterpret_cast<NumericT *>(viennacl::traits::handle(obj).cuda_handle().get()); -} - -/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL matrix (through the base class matrix_base). Const version. */ -template<typename NumericT> -const NumericT * cuda_arg(matrix_base<NumericT> const & obj) -{ - return reinterpret_cast<const NumericT *>(viennacl::traits::handle(obj).cuda_handle().get()); -} - - - -////// mem_handle - - -/** @brief Convenience helper function for extracting the CUDA handle from a generic memory handle. Non-const version. */ -template<typename ReturnT> -ReturnT * cuda_arg(viennacl::backend::mem_handle & h) -{ - return reinterpret_cast<ReturnT *>(h.cuda_handle().get()); -} - -/** @brief Convenience helper function for extracting the CUDA handle from a generic memory handle. Const-version. */ -template<typename ReturnT> -ReturnT const * cuda_arg(viennacl::backend::mem_handle const & h) -{ - return reinterpret_cast<const ReturnT *>(h.cuda_handle().get()); -} - -/** \cond **/ -template<typename ReturnT> -ReturnT * cuda_arg(viennacl::backend::mem_handle::cuda_handle_type & h) -{ - return reinterpret_cast<ReturnT *>(h.get()); -} - -template<typename ReturnT> -ReturnT const * cuda_arg(viennacl::backend::mem_handle::cuda_handle_type const & h) -{ - return reinterpret_cast<const ReturnT *>(h.get()); -} - -inline unsigned int cuda_arg(unsigned int val) { return val; } - -template<typename NumericT> char cuda_arg(char val) { return val; } -template<typename NumericT> unsigned char cuda_arg(unsigned char val) { return val; } - -template<typename NumericT> short cuda_arg(short val) { return val; } -template<typename NumericT> unsigned short cuda_arg(unsigned short val) { return val; } - -template<typename NumericT> int cuda_arg(int val) { return val; } -template<typename NumericT> unsigned int cuda_arg(unsigned int val) { return val; } - -template<typename NumericT> long cuda_arg(long val) { return val; } -template<typename NumericT> unsigned long cuda_arg(unsigned long val) { return val; } - -template<typename NumericT> float cuda_arg(float val) { return val; } -template<typename NumericT> double cuda_arg(double val) { return val; } - -/** \endcond */ - - -namespace linalg -{ -namespace cuda -{ - - -namespace detail -{ - -inline unsigned int make_options(vcl_size_t length, bool reciprocal, bool flip_sign) -{ - return static_cast<unsigned int>( ((length > 1) ? (static_cast<unsigned int>(length) << 2) : 0) + (reciprocal ? 2 : 0) + (flip_sign ? 1 : 0) ); -} - -inline void cuda_last_error_check(const char * message, const char * file, const int line ) -{ - cudaError_t error_code = cudaGetLastError(); - - if (cudaSuccess != error_code) - { - std::stringstream ss; - ss << file << "(" << line << "): " << ": getLastCudaError() CUDA error " << error_code << ": " << cudaGetErrorString( error_code ) << " @ " << message << std::endl; - throw viennacl::backend::cuda::cuda_exception(ss.str(), error_code); - } -} - -template<typename NumericT> -struct type_to_type2; - -template<> -struct type_to_type2<float> { typedef float2 type; }; - -template<> -struct type_to_type2<double> { typedef double2 type; }; - - -template<typename NumericT, typename OtherT> -typename viennacl::backend::mem_handle::cuda_handle_type & arg_reference(viennacl::scalar<NumericT> & s, OtherT) { return s.handle().cuda_handle(); } - -template<typename NumericT, typename OtherT> -typename viennacl::backend::mem_handle::cuda_handle_type const & arg_reference(viennacl::scalar<NumericT> const & s, OtherT) { return s.handle().cuda_handle(); } - -// all other cases where T is not a ViennaCL scalar -template<typename ArgT> -typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value, - char const &>::type -arg_reference(ArgT, char const & val) { return val; } - -template<typename ArgT> -typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value, - unsigned char const &>::type -arg_reference(ArgT, unsigned char const & val) { return val; } - -template<typename ArgT> -typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value, - short const &>::type -arg_reference(ArgT, short const & val) { return val; } - -template<typename ArgT> -typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value, - unsigned short const &>::type -arg_reference(ArgT, unsigned short const & val) { return val; } - -template<typename ArgT> -typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value, - int const &>::type -arg_reference(ArgT, int const & val) { return val; } - -template<typename ArgT> -typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value, - unsigned int const &>::type -arg_reference(ArgT, unsigned int const & val) { return val; } - -template<typename ArgT> -typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value, - long const &>::type -arg_reference(ArgT, long const & val) { return val; } - -template<typename ArgT> -typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value, - unsigned long const &>::type -arg_reference(ArgT, unsigned long const & val) { return val; } - -template<typename ArgT> -typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value, - float const &>::type -arg_reference(ArgT, float const & val) { return val; } - -template<typename ArgT> -typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value, - double const &>::type -arg_reference(ArgT, double const & val) { return val; } - -} //namespace detail -} //namespace cuda -} //namespace linalg -} //namespace viennacl - - -#endif http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp deleted file mode 100644 index ae70f9a..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp +++ /dev/null @@ -1,412 +0,0 @@ -#ifndef VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP -#define VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP - -/* ========================================================================= - Copyright (c) 2010-2016, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. - Portions of this software are copyright by UChicago Argonne, LLC. - - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - -/** @file viennacl/linalg/cuda/direct_solve.hpp - @brief Implementations of dense direct solvers using CUDA are found here. -*/ - -#include "viennacl/forwards.h" -#include "viennacl/vector.hpp" -#include "viennacl/matrix.hpp" - - -#include "viennacl/linalg/cuda/common.hpp" - - -namespace viennacl -{ -namespace linalg -{ -namespace cuda -{ - -template<typename NumericT> -__global__ void matrix_matrix_upper_solve_kernel( - const NumericT * A, - unsigned int A_start1, unsigned int A_start2, - unsigned int A_inc1, unsigned int A_inc2, - unsigned int A_size1, unsigned int A_size2, - unsigned int A_internal_size1, unsigned int A_internal_size2, - bool row_major_A, - - NumericT * B, - unsigned int B_start1, unsigned int B_start2, - unsigned int B_inc1, unsigned int B_inc2, - unsigned int B_size1, unsigned int B_size2, - unsigned int B_internal_size1, unsigned int B_internal_size2, - bool row_major_B, - - bool unit_diagonal) -{ - NumericT temp; - NumericT entry_A; - - for (unsigned int row_cnt = 0; row_cnt < A_size1; ++row_cnt) - { - unsigned int row = A_size1 - 1 - row_cnt; - - if (!unit_diagonal) - { - __syncthreads(); - - if (threadIdx.x == 0) - { - if (row_major_B) - B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)] - : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1]; - else //if (!row_major_B) - B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)] - : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1]; - } - } - - __syncthreads(); - - if (row_major_B) - temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)]; - else //if (!row_major_B) - temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1]; - - //eliminate column of op(A) with index 'row' in parallel: " << std::endl; - for (unsigned int elim = threadIdx.x; elim < row; elim += blockDim.x) - { - if (row_major_A) - entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; - else //if (!row_major_A) - entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; - - if (row_major_B) - B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A; - else //if (!row_major_B) - B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A; - - } - } -} - - - -template<typename NumericT> -__global__ void matrix_matrix_lower_solve_kernel( - const NumericT * A, - unsigned int A_start1, unsigned int A_start2, - unsigned int A_inc1, unsigned int A_inc2, - unsigned int A_size1, unsigned int A_size2, - unsigned int A_internal_size1, unsigned int A_internal_size2, - bool row_major_A, - - NumericT * B, - unsigned int B_start1, unsigned int B_start2, - unsigned int B_inc1, unsigned int B_inc2, - unsigned int B_size1, unsigned int B_size2, - unsigned int B_internal_size1, unsigned int B_internal_size2, - bool row_major_B, - - bool unit_diagonal) -{ - NumericT temp; - NumericT entry_A; - - for (unsigned int row = 0; row < A_size1; ++row) - { - - if (!unit_diagonal) - { - __syncthreads(); - - if (threadIdx.x == 0) - { - if (row_major_B) - B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)] - : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1]; - else //if (!row_major_B) - B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)] - : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1]; - } - } - - __syncthreads(); - - if (row_major_B) - temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)]; - else //if (!row_major_B) - temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1]; - - //eliminate column of op(A) with index 'row' in parallel: " << std::endl; - for (unsigned int elim = row + threadIdx.x + 1; elim < A_size1; elim += blockDim.x) - { - if (row_major_A) - entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; - else //if (!row_major_A) - entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; - - if (row_major_B) - B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A; - else //if (!row_major_B) - B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A; - - } - } -} - - - - - - -namespace detail -{ - template<typename TagT> - bool is_unit_solve(TagT const & tag) { return false; } - - inline bool is_unit_solve(viennacl::linalg::unit_lower_tag) { return true; } - inline bool is_unit_solve(viennacl::linalg::unit_upper_tag) { return true; } - - template<typename TagT> - bool is_upper_solve(TagT const & tag) { return false; } - - inline bool is_upper_solve(viennacl::linalg::upper_tag) { return true; } - inline bool is_upper_solve(viennacl::linalg::unit_upper_tag) { return true; } - - template<typename Matrix1T, typename Matrix2T, typename SolverTagT> - void inplace_solve_impl(Matrix1T const & A, - Matrix2T & B, - SolverTagT const & tag) - { - typedef typename viennacl::result_of::cpu_value_type<Matrix1T>::type value_type; - - dim3 threads(128); - dim3 grid(B.size2()); - - if (is_upper_solve(tag)) - { - matrix_matrix_upper_solve_kernel<<<grid,threads>>>(viennacl::cuda_arg(A), - static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), - static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), - static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), - static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), - bool(A.row_major()), - - viennacl::cuda_arg(B), - static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), - static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), - static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), - static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), - bool(B.row_major()), - - is_unit_solve(tag) - ); - } - else - { - matrix_matrix_lower_solve_kernel<<<grid,threads>>>(viennacl::cuda_arg(A), - static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), - static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), - static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), - static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), - bool(A.row_major()), - - viennacl::cuda_arg(B), - static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), - static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), - static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), - static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), - bool(B.row_major()), - - is_unit_solve(tag) - ); - } - - } -} - - -// -// Note: By convention, all size checks are performed in the calling frontend. No need to double-check here. -// - -////////////////// triangular solver ////////////////////////////////////// -/** @brief Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notation). -* -* @param A The system matrix -* @param B The matrix of row vectors, where the solution is directly written to -* @param tag Solver tag for identifying the respective triangular solver -*/ -template<typename NumericT, typename SolverTagT> -void inplace_solve(matrix_base<NumericT> const & A, - matrix_base<NumericT> & B, - SolverTagT tag) -{ - detail::inplace_solve_impl(A, B, tag); -} - - -// -// Solve on vector -// - -template<typename NumericT> -__global__ void triangular_substitute_inplace_row_kernel( - NumericT const * A, - unsigned int A_start1, unsigned int A_start2, - unsigned int A_inc1, unsigned int A_inc2, - unsigned int A_size1, unsigned int A_size2, - unsigned int A_internal_size1, unsigned int A_internal_size2, - NumericT * v, - unsigned int v_start, - unsigned int v_inc, - unsigned int v_size, - - unsigned int options) -{ - NumericT temp; - unsigned int unit_diagonal_flag = (options & (1 << 0)); - - unsigned int is_lower_solve = (options & (1 << 2)); - unsigned int row; - for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed) //Note: A required to be square - { - row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1); - if (!unit_diagonal_flag) - { - __syncthreads(); - if (threadIdx.x == 0) - v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; - } - - __syncthreads(); - - temp = v[row * v_inc + v_start]; - - for (int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x); - elim < (is_lower_solve ? A_size1 : row); - elim += blockDim.x) - v[elim * v_inc + v_start] -= temp * A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; - } -} - - -template<typename NumericT> -__global__ void triangular_substitute_inplace_col_kernel( - NumericT const * A, - unsigned int A_start1, unsigned int A_start2, - unsigned int A_inc1, unsigned int A_inc2, - unsigned int A_size1, unsigned int A_size2, - unsigned int A_internal_size1, unsigned int A_internal_size2, - NumericT * v, - unsigned int v_start, - unsigned int v_inc, - unsigned int v_size, - unsigned int options) -{ - NumericT temp; - unsigned int unit_diagonal_flag = (options & (1 << 0)); - - unsigned int is_lower_solve = (options & (1 << 2)); - unsigned int row; - for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed) //Note: A required to be square - { - row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1); - if (!unit_diagonal_flag) - { - __syncthreads(); - if (threadIdx.x == 0) - v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; - } - - __syncthreads(); - - temp = v[row * v_inc + v_start]; - - for (int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x); - elim < (is_lower_solve ? A_size1 : row); - elim += blockDim.x) - v[elim * v_inc + v_start] -= temp * A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; - } -} - - -namespace detail -{ - inline unsigned int get_option_for_solver_tag(viennacl::linalg::upper_tag) { return 0; } - inline unsigned int get_option_for_solver_tag(viennacl::linalg::unit_upper_tag) { return (1 << 0); } - inline unsigned int get_option_for_solver_tag(viennacl::linalg::lower_tag) { return (1 << 2); } - inline unsigned int get_option_for_solver_tag(viennacl::linalg::unit_lower_tag) { return (1 << 2) | (1 << 0); } - - template<typename MatrixT, typename VectorT> - void inplace_solve_vector_impl(MatrixT const & mat, - VectorT & vec, - unsigned int options) - { - typedef typename viennacl::result_of::cpu_value_type<MatrixT>::type value_type; - - if (mat.row_major()) - { - triangular_substitute_inplace_row_kernel<<<1, 128>>>(viennacl::cuda_arg(mat), - static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)), - static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)), - static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)), - static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)), - viennacl::cuda_arg(vec), - static_cast<unsigned int>(viennacl::traits::start(vec)), - static_cast<unsigned int>(viennacl::traits::stride(vec)), - static_cast<unsigned int>(viennacl::traits::size(vec)), - options - ); - } - else - { - triangular_substitute_inplace_col_kernel<<<1, 128>>>(viennacl::cuda_arg(mat), - static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)), - static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)), - static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)), - static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)), - viennacl::cuda_arg(vec), - static_cast<unsigned int>(viennacl::traits::start(vec)), - static_cast<unsigned int>(viennacl::traits::stride(vec)), - static_cast<unsigned int>(viennacl::traits::size(vec)), - options - ); - } - } - -} - -/** @brief Direct inplace solver for dense triangular systems (non-transposed version) -* -* @param mat The system matrix proxy -* @param vec The load vector, where the solution is directly written to -*/ -template<typename NumericT, typename SolverTagT> -void inplace_solve(matrix_base<NumericT> const & mat, - vector_base<NumericT> & vec, - SolverTagT) -{ - unsigned int options = detail::get_option_for_solver_tag(SolverTagT()); - - detail::inplace_solve_vector_impl(mat, vec, options); -} - - -} -} -} - -#endif
