http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp new file mode 100755 index 0000000..960f5c2 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp @@ -0,0 +1,180 @@ +#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_ONEI_HPP_ +#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_ONEI_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + + +/** @file viennacl/linalg/cuda/bisect_kernel_large_onei.hpp + @brief Determine eigenvalues for large matrices for intervals that contained after the first step one eigenvalue + + Implementation based on the sample provided with the CUDA 6.0 SDK, for which + the creation of derivative works is allowed by including the following statement: + "This software contains source code provided by NVIDIA Corporation." 
+*/ + +// includes, project +#include "viennacl/linalg/detail/bisect/config.hpp" +#include "viennacl/linalg/detail/bisect/util.hpp" +// additional kernel +#include "viennacl/linalg/cuda/bisect_util.hpp" + +namespace viennacl +{ +namespace linalg +{ +namespace cuda +{ +//////////////////////////////////////////////////////////////////////////////// +//! Determine eigenvalues for large matrices for intervals that after +//! the first step contained one eigenvalue +//! @param g_d diagonal elements of symmetric, tridiagonal matrix +//! @param g_s superdiagonal elements of symmetric, tridiagonal matrix +//! @param n matrix size +//! @param num_intervals total number of intervals containing one eigenvalue +//! after the first step +//! @param g_left left interval limits +//! @param g_right right interval limits +//! @param g_pos index of interval / number of intervals that are smaller than +//! right interval limit +//! @param precision desired precision of eigenvalues +//////////////////////////////////////////////////////////////////////////////// +template<typename NumericT> +__global__ +void +bisectKernelLarge_OneIntervals(const NumericT *g_d, const NumericT *g_s, const unsigned int n, + unsigned int num_intervals, + NumericT *g_left, NumericT *g_right, + unsigned int *g_pos, + NumericT precision) +{ + + const unsigned int gtid = (blockDim.x * blockIdx.x) + threadIdx.x; + + __shared__ NumericT s_left_scratch[VIENNACL_BISECT_MAX_THREADS_BLOCK]; + __shared__ NumericT s_right_scratch[VIENNACL_BISECT_MAX_THREADS_BLOCK]; + + // active interval of thread + // left and right limit of current interval + NumericT left, right; + // number of threads smaller than the right limit (also corresponds to the + // global index of the eigenvalues contained in the active interval) + unsigned int right_count; + // flag if current thread converged + unsigned int converged = 0; + // midpoint when current interval is subdivided + NumericT mid = 0.0f; + // number of eigenvalues less than 
mid + unsigned int mid_count = 0; + + // read data from global memory + if (gtid < num_intervals) + { + left = g_left[gtid]; + right = g_right[gtid]; + right_count = g_pos[gtid]; + } + + + // flag to determine if all threads converged to eigenvalue + __shared__ unsigned int converged_all_threads; + + // initialized shared flag + if (0 == threadIdx.x) + { + converged_all_threads = 0; + } + + __syncthreads(); + + // process until all threads converged to an eigenvalue + while (true) + { + + converged_all_threads = 1; + + // update midpoint for all active threads + if ((gtid < num_intervals) && (0 == converged)) + { + mid = computeMidpoint(left, right); + } + + // find number of eigenvalues that are smaller than midpoint + mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, + mid, gtid, num_intervals, + s_left_scratch, + s_right_scratch, + converged); + + __syncthreads(); + + // for all active threads + if ((gtid < num_intervals) && (0 == converged)) + { + + // update intervals -- always one child interval survives + if (right_count == mid_count) + { + right = mid; + } + else + { + left = mid; + } + + // check for convergence + NumericT t0 = right - left; + NumericT t1 = max(abs(right), abs(left)) * precision; + + if (t0 < min(precision, t1)) + { + NumericT lambda = computeMidpoint(left, right); + left = lambda; + right = lambda; + + converged = 1; + } + else + { + converged_all_threads = 0; + } + } + + __syncthreads(); + + if (1 == converged_all_threads) + { + break; + } + + __syncthreads(); + } + + // write data back to global memory + __syncthreads(); + + if (gtid < num_intervals) + { + // intervals converged so left and right interval limit are both identical + // and identical to the eigenvalue + g_left[gtid] = left; + } +} +} // namespace cuda +} // namespace linalg +} // namespace viennacl +#endif // #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_ONEI_HPP_
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp new file mode 100755 index 0000000..310b381 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp @@ -0,0 +1,261 @@ +#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_SMALL_HPP_ +#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_SMALL_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + + +/** @file viennacl/linalg/cuda/bisect_kernel_small.hpp + @brief Determine eigenvalues for small symmetric, tridiagonal matrix + + Implementation based on the sample provided with the CUDA 6.0 SDK, for which + the creation of derivative works is allowed by including the following statement: + "This software contains source code provided by NVIDIA Corporation." 
+*/ + +// includes, project +#include "viennacl/linalg/detail/bisect/config.hpp" +#include "viennacl/linalg/detail/bisect/util.hpp" +// additional kernel +#include "viennacl/linalg/cuda/bisect_util.hpp" + +namespace viennacl +{ +namespace linalg +{ +namespace cuda +{ + +/** @brief Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix +* +* @param g_d diagonal elements in global memory +* @param g_s superdiagonal elements in global elements (stored so that the element *(g_s - 1) can be accessed an equals 0 +* @param n size of matrix +* @param g_left helper array +* @param g_right helper array +* @param g_left_count helper array +* @param g_right_count helper array +* @param lg lower bound of input interval (e.g. Gerschgorin interval) +* @param ug upper bound of input interval (e.g. Gerschgorin interval) +* @param lg_eig_count number of eigenvalues that are smaller than lg +* @param ug_eig_count number of eigenvalues that are smaller than lu +* @param epsilon desired accuracy of eigenvalues to compute +*/ +template<typename NumericT> +__global__ +void +bisectKernelSmall(const NumericT *g_d, const NumericT *g_s, const unsigned int n, + NumericT * g_left, NumericT *g_right, + unsigned int *g_left_count, unsigned int *g_right_count, + const NumericT lg, const NumericT ug, + const unsigned int lg_eig_count, const unsigned int ug_eig_count, + NumericT epsilon + ) +{ + // intervals (store left and right because the subdivision tree is in general + // not dense + __shared__ NumericT s_left[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; + __shared__ NumericT s_right[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; + + // number of eigenvalues that are smaller than s_left / s_right + // (correspondence is realized via indices) + __shared__ unsigned int s_left_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; + __shared__ unsigned int s_right_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; + + // helper for stream compaction + __shared__ 
unsigned int + s_compaction_list[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX + 1]; + + // state variables for whole block + // if 0 then compaction of second chunk of child intervals is not necessary + // (because all intervals had exactly one non-dead child) + __shared__ unsigned int compact_second_chunk; + __shared__ unsigned int all_threads_converged; + + // number of currently active threads + __shared__ unsigned int num_threads_active; + + // number of threads to use for stream compaction + __shared__ unsigned int num_threads_compaction; + + // helper for exclusive scan + unsigned int *s_compaction_list_exc = s_compaction_list + 1; + + + // variables for currently processed interval + // left and right limit of active interval + NumericT left = 0.0f; + NumericT right = 0.0f; + unsigned int left_count = 0; + unsigned int right_count = 0; + // midpoint of active interval + NumericT mid = 0.0f; + // number of eigenvalues smaller then mid + unsigned int mid_count = 0; + // affected from compaction + unsigned int is_active_second = 0; + + s_compaction_list[threadIdx.x] = 0; + s_left[threadIdx.x] = 0; + s_right[threadIdx.x] = 0; + s_left_count[threadIdx.x] = 0; + s_right_count[threadIdx.x] = 0; + + __syncthreads(); + + // set up initial configuration + if (0 == threadIdx.x) + { + s_left[0] = lg; + s_right[0] = ug; + s_left_count[0] = lg_eig_count; + s_right_count[0] = ug_eig_count; + + compact_second_chunk = 0; + num_threads_active = 1; + + num_threads_compaction = 1; + } + + // for all active threads read intervals from the last level + // the number of (worst case) active threads per level l is 2^l + while (true) + { + + all_threads_converged = 1; + __syncthreads(); + + is_active_second = 0; + subdivideActiveIntervalMulti(threadIdx.x, + s_left, s_right, s_left_count, s_right_count, + num_threads_active, + left, right, left_count, right_count, + mid, all_threads_converged); + + __syncthreads(); + + // check if done + if (1 == all_threads_converged) + { + break; 
+ } + + __syncthreads(); + + // compute number of eigenvalues smaller than mid + // use all threads for reading the necessary matrix data from global + // memory + // use s_left and s_right as scratch space for diagonal and + // superdiagonal of matrix + mid_count = computeNumSmallerEigenvals(g_d, g_s, n, mid, + threadIdx.x, num_threads_active, + s_left, s_right, + (left == right)); + + __syncthreads(); + + // store intervals + // for all threads store the first child interval in a continuous chunk of + // memory, and the second child interval -- if it exists -- in a second + // chunk; it is likely that all threads reach convergence up to + // \a epsilon at the same level; furthermore, for higher level most / all + // threads will have only one child, storing the first child compactly will + // (first) avoid to perform a compaction step on the first chunk, (second) + // make it for higher levels (when all threads / intervals have + // exactly one child) unnecessary to perform a compaction of the second + // chunk + if (threadIdx.x < num_threads_active) + { + + if (left != right) + { + + // store intervals + storeNonEmptyIntervals(threadIdx.x, num_threads_active, + s_left, s_right, s_left_count, s_right_count, + left, mid, right, + left_count, mid_count, right_count, + epsilon, compact_second_chunk, + s_compaction_list_exc, + is_active_second); + } + else + { + + storeIntervalConverged(s_left, s_right, s_left_count, s_right_count, + left, mid, right, + left_count, mid_count, right_count, + s_compaction_list_exc, compact_second_chunk, + num_threads_active, + is_active_second); + } + } + + // necessary so that compact_second_chunk is up-to-date + __syncthreads(); + + // perform compaction of chunk where second children are stored + // scan of (num_threads_active / 2) elements, thus at most + // (num_threads_active / 4) threads are needed + if (compact_second_chunk > 0) + { + + createIndicesCompaction(s_compaction_list_exc, num_threads_compaction); + + 
compactIntervals(s_left, s_right, s_left_count, s_right_count, + mid, right, mid_count, right_count, + s_compaction_list, num_threads_active, + is_active_second); + } + + __syncthreads(); + + if (0 == threadIdx.x) + { + + // update number of active threads with result of reduction + num_threads_active += s_compaction_list[num_threads_active]; + + num_threads_compaction = ceilPow2(num_threads_active); + + compact_second_chunk = 0; + } + + __syncthreads(); + + } + + __syncthreads(); + + // write resulting intervals to global mem + // for all threads write if they have been converged to an eigenvalue to + // a separate array + + // at most n valid intervals + if (threadIdx.x < n) + { + + // intervals converged so left and right limit are identical + g_left[threadIdx.x] = s_left[threadIdx.x]; + // left count is sufficient to have global order + g_left_count[threadIdx.x] = s_left_count[threadIdx.x]; + } +} +} // namespace cuda +} // namespace linalg +} // namespace viennacl +#endif // #ifndef _BISECT_KERNEL_SMALL_H_ http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp new file mode 100755 index 0000000..e2e262c --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp @@ -0,0 +1,613 @@ +#ifndef VIENNACL_LINALG_CUDA_BISECT_BISECT_UTIL_HPP_ +#define VIENNACL_LINALG_CUDA_BISECT_BISECT_UTIL_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. 
+ + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + + +/** @file viennacl/linalg/cuda/bisect_kernel_small.hpp + @brief Utility / shared functionality for bisection kernels + + Implementation based on the sample provided with the CUDA 6.0 SDK, for which + the creation of derivative works is allowed by including the following statement: + "This software contains source code provided by NVIDIA Corporation." +*/ + +// includes, project +#include "viennacl/linalg/detail/bisect/config.hpp" +#include "viennacl/linalg/detail/bisect/util.hpp" + +namespace viennacl +{ +namespace linalg +{ +namespace cuda +{ +//////////////////////////////////////////////////////////////////////////////// +//! Compute the next lower power of two of n +//! @param n number for which next higher power of two is seeked +//////////////////////////////////////////////////////////////////////////////// +__device__ +inline int +floorPow2(int n) +{ + + // early out if already power of two + if (0 == (n & (n-1))) + { + return n; + } + + int exp; + frexp((float)n, &exp); + return (1 << (exp - 1)); +} + +//////////////////////////////////////////////////////////////////////////////// +//! Compute the next higher power of two of n +//! @param n number for which next higher power of two is seeked +//////////////////////////////////////////////////////////////////////////////// +__device__ +inline int +ceilPow2(int n) +{ + + // early out if already power of two + if (0 == (n & (n-1))) + { + return n; + } + + int exp; + frexp((float)n, &exp); + return (1 << exp); +} + +//////////////////////////////////////////////////////////////////////////////// +//! 
Compute midpoint of interval [\a left, \a right] avoiding overflow if +//! possible +//! @param left left / lower limit of interval +//! @param right right / upper limit of interval +//////////////////////////////////////////////////////////////////////////////// +template<typename NumericT> +__device__ +inline NumericT +computeMidpoint(const NumericT left, const NumericT right) +{ + + NumericT mid; + + if (viennacl::linalg::detail::sign_f(left) == viennacl::linalg::detail::sign_f(right)) + { + mid = left + (right - left) * 0.5f; + } + else + { + mid = (left + right) * 0.5f; + } + + return mid; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Check if interval converged and store appropriately +//! @param addr address where to store the information of the interval +//! @param s_left shared memory storage for left interval limits +//! @param s_right shared memory storage for right interval limits +//! @param s_left_count shared memory storage for number of eigenvalues less +//! than left interval limits +//! @param s_right_count shared memory storage for number of eigenvalues less +//! than right interval limits +//! @param left lower limit of interval +//! @param right upper limit of interval +//! @param left_count eigenvalues less than \a left +//! @param right_count eigenvalues less than \a right +//! 
@param precision desired precision for eigenvalues +//////////////////////////////////////////////////////////////////////////////// +template<class S, class T, class NumericT> +__device__ +void +storeInterval(unsigned int addr, + NumericT *s_left, NumericT *s_right, + T *s_left_count, T *s_right_count, + NumericT left, NumericT right, + S left_count, S right_count, + NumericT precision) +{ + s_left_count[addr] = left_count; + s_right_count[addr] = right_count; + + // check if interval converged + NumericT t0 = abs(right - left); + NumericT t1 = max(abs(left), abs(right)) * precision; + + if (t0 <= max(static_cast<NumericT>(VIENNACL_BISECT_MIN_ABS_INTERVAL), t1)) + { + // compute mid point + NumericT lambda = computeMidpoint(left, right); + + // mark as converged + s_left[addr] = lambda; + s_right[addr] = lambda; + } + else + { + + // store current limits + s_left[addr] = left; + s_right[addr] = right; + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Compute number of eigenvalues that are smaller than x given a symmetric, +//! real, and tridiagonal matrix +//! @param g_d diagonal elements stored in global memory +//! @param g_s superdiagonal elements stored in global memory +//! @param n size of matrix +//! @param x value for which the number of eigenvalues that are smaller is +//! seeked +//! @param tid thread identified (e.g. threadIdx.x or gtid) +//! @param num_intervals_active number of active intervals / threads that +//! currently process an interval +//! @param s_d scratch space to store diagonal entries of the tridiagonal +//! matrix in shared memory +//! @param s_s scratch space to store superdiagonal entries of the tridiagonal +//! matrix in shared memory +//! @param converged flag if the current thread is already converged (that +//! 
is count does not have to be computed) +//////////////////////////////////////////////////////////////////////////////// +template<typename NumericT> +__device__ +inline unsigned int +computeNumSmallerEigenvals(const NumericT *g_d, const NumericT *g_s, const unsigned int n, + const NumericT x, + const unsigned int tid, + const unsigned int num_intervals_active, + NumericT *s_d, NumericT *s_s, + unsigned int converged + ) +{ + + NumericT delta = 1.0f; + unsigned int count = 0; + + __syncthreads(); + + // read data into shared memory + if (threadIdx.x < n) + { + s_d[threadIdx.x] = *(g_d + threadIdx.x); + s_s[threadIdx.x] = *(g_s + threadIdx.x - 1); + } + + __syncthreads(); + + // perform loop only for active threads + if ((tid < num_intervals_active) && (0 == converged)) + { + + // perform (optimized) Gaussian elimination to determine the number + // of eigenvalues that are smaller than n + for (unsigned int k = 0; k < n; ++k) + { + delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; + count += (delta < 0) ? 1 : 0; + } + + } // end if thread currently processing an interval + + return count; +} +//////////////////////////////////////////////////////////////////////////////// +//! Compute number of eigenvalues that are smaller than x given a symmetric, +//! real, and tridiagonal matrix +//! @param g_d diagonal elements stored in global memory +//! @param g_s superdiagonal elements stored in global memory +//! @param n size of matrix +//! @param x value for which the number of eigenvalues that are smaller is +//! seeked +//! @param tid thread identified (e.g. threadIdx.x or gtid) +//! @param num_intervals_active number of active intervals / threads that +//! currently process an interval +//! @param s_d scratch space to store diagonal entries of the tridiagonal +//! matrix in shared memory +//! @param s_s scratch space to store superdiagonal entries of the tridiagonal +//! matrix in shared memory +//! 
@param converged flag if the current thread is already converged (that +//! is count does not have to be computed) +//////////////////////////////////////////////////////////////////////////////// +template<typename NumericT> +__device__ +inline unsigned int +computeNumSmallerEigenvalsLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int n, + const NumericT x, + const unsigned int tid, + const unsigned int num_intervals_active, + NumericT *s_d, NumericT *s_s, + unsigned int converged + ) +{ + NumericT delta = 1.0f; + unsigned int count = 0; + + unsigned int rem = n; + + // do until whole diagonal and superdiagonal has been loaded and processed + for (unsigned int i = 0; i < n; i += blockDim.x) + { + + __syncthreads(); + + // read new chunk of data into shared memory + if ((i + threadIdx.x) < n) + { + + s_d[threadIdx.x] = *(g_d + i + threadIdx.x); + s_s[threadIdx.x] = *(g_s + i + threadIdx.x - 1); + } + + __syncthreads(); + + + if (tid < num_intervals_active) + { + + // perform (optimized) Gaussian elimination to determine the number + // of eigenvalues that are smaller than n + for (unsigned int k = 0; k < min(rem,blockDim.x); ++k) + { + delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; + // delta = (abs( delta) < (1.0e-10)) ? -(1.0e-10) : delta; + count += (delta < 0) ? 1 : 0; + } + + } // end if thread currently processing an interval + + rem -= blockDim.x; + } + + return count; +} + +/** @brief Store all non-empty intervals resulting from the subdivision of the interval currently processed by the thread. 
+* +* @param addr base address for storing intervals +* @param num_threads_active number of threads / intervals in current sweep +* @param s_left shared memory storage for left interval limits +* @param s_right shared memory storage for right interval limits +* @param s_left_count shared memory storage for number of eigenvalues less than left interval limits +* @param s_right_count shared memory storage for number of eigenvalues less than right interval limits +* @param left lower limit of interval +* @param mid midpoint of interval +* @param right upper limit of interval +* @param left_count eigenvalues less than \a left +* @param mid_count eigenvalues less than \a mid +* @param right_count eigenvalues less than \a right +* @param precision desired precision for eigenvalues +* @param compact_second_chunk shared mem flag if second chunk is used and ergo requires compaction +* @param s_compaction_list_exc helper array for stream compaction, s_compaction_list_exc[tid] = 1 when the thread generated two child intervals +* @param is_active_second mark is thread has a second non-empty child interval +*/ +template<class S, class T, class NumericT> +__device__ +void +storeNonEmptyIntervals(unsigned int addr, + const unsigned int num_threads_active, + NumericT *s_left, NumericT *s_right, + T *s_left_count, T *s_right_count, + NumericT left, NumericT mid, NumericT right, + const S left_count, + const S mid_count, + const S right_count, + NumericT precision, + unsigned int &compact_second_chunk, + T *s_compaction_list_exc, + unsigned int &is_active_second) +{ + // check if both child intervals are valid + + if ((left_count != mid_count) && (mid_count != right_count)) + { + + // store the left interval + storeInterval(addr, s_left, s_right, s_left_count, s_right_count, + left, mid, left_count, mid_count, precision); + + // mark that a second interval has been generated, only stored after + // stream compaction of second chunk + is_active_second = 1; + 
s_compaction_list_exc[threadIdx.x] = 1; + compact_second_chunk = 1; + } + else + { + + // only one non-empty child interval + + // mark that no second child + is_active_second = 0; + s_compaction_list_exc[threadIdx.x] = 0; + + // store the one valid child interval + if (left_count != mid_count) + { + storeInterval(addr, s_left, s_right, s_left_count, s_right_count, + left, mid, left_count, mid_count, precision); + } + else + { + storeInterval(addr, s_left, s_right, s_left_count, s_right_count, + mid, right, mid_count, right_count, precision); + } + + } +} +//////////////////////////////////////////////////////////////////////////////// +//! Create indices for compaction, that is process \a s_compaction_list_exc +//! which is 1 for intervals that generated a second child and 0 otherwise +//! and create for each of the non-zero elements the index where the new +//! interval belongs to in a compact representation of all generated second +//! childs +//! @param s_compaction_list_exc list containing the flags which threads +//! generated two childs +//! 
@param num_threads_compaction number of threads to employ for compaction +//////////////////////////////////////////////////////////////////////////////// +template<class T> +__device__ +void +createIndicesCompaction(T *s_compaction_list_exc, + unsigned int num_threads_compaction) +{ + + unsigned int offset = 1; + const unsigned int tid = threadIdx.x; + // if(tid == 0) + // printf("num_threads_compaction = %u\n", num_threads_compaction); + + // higher levels of scan tree + for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) + { + + __syncthreads(); + + if (tid < d) + { + + unsigned int ai = offset*(2*tid+1)-1; + unsigned int bi = offset*(2*tid+2)-1; + + s_compaction_list_exc[bi] = s_compaction_list_exc[bi] + + s_compaction_list_exc[ai]; + } + + offset <<= 1; + } + + // traverse down tree: first down to level 2 across + for (int d = 2; d < num_threads_compaction; d <<= 1) + { + + offset >>= 1; + __syncthreads(); + + if (tid < (d-1)) + { + + unsigned int ai = offset*(tid+1) - 1; + unsigned int bi = ai + (offset >> 1); + + s_compaction_list_exc[bi] = s_compaction_list_exc[bi] + + s_compaction_list_exc[ai]; + } + } + + __syncthreads(); + +} + +/** @brief Perform stream compaction for second child intervals +* +* @param s_left shared memory storage for left interval limits +* @param s_right shared memory storage for right interval limits +* @param s_left_count shared memory storage for number of eigenvalues less than left interval limits +* @param s_right_count shared memory storage for number of eigenvalues less than right interval limits +* @param mid midpoint of current interval (left of new interval) +* @param right upper limit of interval +* @param mid_count eigenvalues less than mid +* @param right_count eigenvalues less than right +* @param s_compaction_list list containing the indices where the data has to be stored +* @param num_threads_active number of active threads / intervals +* @param is_active_second mark is thread has a second non-empty child 
interval
*/
// Compaction step: each thread whose subdivision produced an active "second
// child" interval copies that child into the upper half of the shared-memory
// interval lists, at the offset s_compaction_list[tid] (computed elsewhere
// from the per-thread second-child flags).
template<class T, class NumericT>
__device__
void
compactIntervals(NumericT *s_left, NumericT *s_right,
                 T *s_left_count, T *s_right_count,
                 NumericT mid, NumericT right,
                 unsigned int mid_count, unsigned int right_count,
                 T *s_compaction_list,
                 unsigned int num_threads_active,
                 unsigned int is_active_second)
{
  const unsigned int tid = threadIdx.x;

  // perform compaction / copy data for all threads where the second
  // child is not dead
  if ((tid < num_threads_active) && (1 == is_active_second))
  {
    unsigned int addr_w = num_threads_active + s_compaction_list[tid];
    s_left[addr_w] = mid;
    s_right[addr_w] = right;
    s_left_count[addr_w] = mid_count;
    s_right_count[addr_w] = right_count;
  }
}

// Store an interval whose bounds have converged back to shared memory.
// A simple eigenvalue (multiplicity one) is re-stored unchanged; for a
// multiple eigenvalue the interval is duplicated at the same position but
// with the eigenvalue count split in half, so that a second child interval
// is emitted via the compaction flags.
template<class T, class S, class NumericT>
__device__
void
storeIntervalConverged(NumericT *s_left, NumericT *s_right,
                       T *s_left_count, T *s_right_count,
                       NumericT &left, NumericT &mid, NumericT &right,
                       S &left_count, S &mid_count, S &right_count,
                       T *s_compaction_list_exc,
                       unsigned int &compact_second_chunk,
                       const unsigned int num_threads_active,
                       unsigned int &is_active_second)
{
  const unsigned int tid = threadIdx.x;
  const unsigned int multiplicity = right_count - left_count;
  // check multiplicity of eigenvalue
  if (1 == multiplicity)
  {

    // just re-store intervals, simple eigenvalue
    s_left[tid] = left;
    s_right[tid] = right;
    s_left_count[tid] = left_count;
    s_right_count[tid] = right_count;


    // mark that no second child / clear
    is_active_second = 0;
    s_compaction_list_exc[tid] = 0;
  }
  else
  {

    // number of eigenvalues after the split less than mid
    mid_count = left_count + (multiplicity >> 1);

    // store left interval
    s_left[tid] = left;
    s_right[tid] = right;
    s_left_count[tid] = left_count;
    s_right_count[tid] = mid_count;
    mid = left;

    // mark that second child interval exists
    is_active_second = 1;
    s_compaction_list_exc[tid] = 1;
    compact_second_chunk = 1;
  }
}

/**
* @brief Subdivide interval if active and not already converged.
*
* @param tid  id of thread
* @param s_left  shared memory storage for left interval limits
* @param s_right  shared memory storage for right interval limits
* @param s_left_count  shared memory storage for number of eigenvalues less than left interval limits
* @param s_right_count  shared memory storage for number of eigenvalues less than right interval limits
* @param num_threads_active  number of active threads in warp
* @param left  lower limit of interval
* @param right  upper limit of interval
* @param left_count  eigenvalues less than \a left
* @param right_count  eigenvalues less than \a right
* @param mid  median of interval
* @param all_threads_converged  shared memory flag if all threads are converged; cleared whenever any thread still has work to do
*/
template<class T, class NumericT>
__device__
void
subdivideActiveIntervalMulti(const unsigned int tid,
                             NumericT *s_left, NumericT *s_right,
                             T *s_left_count, T *s_right_count,
                             const unsigned int num_threads_active,
                             NumericT &left, NumericT &right,
                             unsigned int &left_count, unsigned int &right_count,
                             NumericT &mid, unsigned int &all_threads_converged)
{
  // for all active threads
  if (tid < num_threads_active)
  {

    left = s_left[tid];
    right = s_right[tid];
    left_count = s_left_count[tid];
    right_count = s_right_count[tid];

    // check if thread already converged
    if (left != right)
    {

      mid = computeMidpoint(left, right);
      all_threads_converged = 0;
    }
    else if ((right_count - left_count) > 1)
    {
      // mark as not converged if multiple eigenvalues enclosed
      // duplicate interval in storeIntervalConverged()
      all_threads_converged = 0;
    }

  } // end for all active threads
}


/** @brief Subdivide interval if active and not already converged.
*
* @param tid  id of thread
* @param s_left  shared memory storage for left interval limits
* @param s_right  shared memory storage for right interval limits
* @param s_left_count  shared memory storage for number of eigenvalues less than left interval limits
* @param s_right_count  shared memory storage for number of eigenvalues less than right interval limits
* @param num_threads_active  number of active threads in warp
* @param left  lower limit of interval
* @param right  upper limit of interval
* @param left_count  eigenvalues less than \a left
* @param right_count  eigenvalues less than \a right
* @param mid  median of interval
* @param all_threads_converged  shared memory flag if all threads are converged; cleared whenever any thread still has work to do
*/
template<class T, class NumericT>
__device__
void
subdivideActiveInterval(const unsigned int tid,
                        NumericT *s_left, NumericT *s_right,
                        T *s_left_count, T *s_right_count,
                        const unsigned int num_threads_active,
                        NumericT &left, NumericT &right,
                        unsigned int &left_count, unsigned int &right_count,
                        NumericT &mid, unsigned int &all_threads_converged)
{
  // for all active threads
  if (tid < num_threads_active)
  {

    left = s_left[tid];
    right = s_right[tid];
    left_count = s_left_count[tid];
    right_count = s_right_count[tid];

    // check if thread already converged
    if (left != right)
    {

      mid = computeMidpoint(left, right);
      all_threads_converged = 0;
    }
  } // end for all active threads
}
} // namespace cuda
} // namespace linalg
} // namespace viennacl

#endif // #ifndef VIENNACL_LINALG_DETAIL_BISECT_UTIL_HPP_

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp
new file mode 100644
index 0000000..3622b89
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp
@@ -0,0 +1,250 @@
#ifndef
VIENNACL_LINALG_CUDA_COMMON_HPP_
#define VIENNACL_LINALG_CUDA_COMMON_HPP_

/* =========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   [email protected]

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/cuda/common.hpp
    @brief Common routines for CUDA execution
*/

#include <sstream>
#include <cuda_runtime.h>
#include "viennacl/backend/cuda.hpp"
#include "viennacl/traits/handle.hpp"

// Queries the last asynchronous CUDA error (typically right after a kernel launch)
// and throws with file/line context on failure.
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message) detail::cuda_last_error_check (message, __FILE__, __LINE__)

namespace viennacl
{

////// scalar

/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Non-const version. */
template<typename NumericT>
NumericT * cuda_arg(scalar<NumericT> & obj)
{
  return reinterpret_cast<NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
}

/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Const version. */
template<typename NumericT>
const NumericT * cuda_arg(scalar<NumericT> const & obj)
{
  return reinterpret_cast<const NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
}


////// vector_base

/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL vector (through the base class vector_base) with implicit return type deduction. Non-const version. */
template<typename NumericT>
NumericT * cuda_arg(vector_base<NumericT> & obj)
{
  return reinterpret_cast<NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
}

/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL vector (through the base class vector_base) with implicit return type deduction. Const version. */
template<typename NumericT>
const NumericT * cuda_arg(vector_base<NumericT> const & obj)
{
  return reinterpret_cast<const NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
}

/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL vector (through the base class vector_base). Return type needs to be explicitly provided as first template argument. Non-const version. */
template<typename ReturnT, typename NumericT>
ReturnT * cuda_arg(vector_base<NumericT> & obj)
{
  return reinterpret_cast<ReturnT *>(viennacl::traits::handle(obj).cuda_handle().get());
}

/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL vector (through the base class vector_base). Return type needs to be explicitly provided as first template argument. Const version. */
template<typename ReturnT, typename NumericT>
const ReturnT * cuda_arg(vector_base<NumericT> const & obj)
{
  return reinterpret_cast<const ReturnT *>(viennacl::traits::handle(obj).cuda_handle().get());
}


////// matrix_base

/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL matrix (through the base class matrix_base). Non-const version. */
template<typename NumericT>
NumericT * cuda_arg(matrix_base<NumericT> & obj)
{
  return reinterpret_cast<NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
}

/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL matrix (through the base class matrix_base). Const version. */
template<typename NumericT>
const NumericT * cuda_arg(matrix_base<NumericT> const & obj)
{
  return reinterpret_cast<const NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
}



////// mem_handle


/** @brief Convenience helper function for extracting the CUDA handle from a generic memory handle. Non-const version. */
template<typename ReturnT>
ReturnT * cuda_arg(viennacl::backend::mem_handle & h)
{
  return reinterpret_cast<ReturnT *>(h.cuda_handle().get());
}

/** @brief Convenience helper function for extracting the CUDA handle from a generic memory handle. Const-version. */
template<typename ReturnT>
ReturnT const * cuda_arg(viennacl::backend::mem_handle const & h)
{
  return reinterpret_cast<const ReturnT *>(h.cuda_handle().get());
}

/** \cond **/
template<typename ReturnT>
ReturnT * cuda_arg(viennacl::backend::mem_handle::cuda_handle_type & h)
{
  return reinterpret_cast<ReturnT *>(h.get());
}

template<typename ReturnT>
ReturnT const * cuda_arg(viennacl::backend::mem_handle::cuda_handle_type const & h)
{
  return reinterpret_cast<const ReturnT *>(h.get());
}

inline unsigned int cuda_arg(unsigned int val)  { return val; }

// Pass-through overloads for CPU scalars: the NumericT template parameter must be
// given explicitly at the call site; the value is forwarded unchanged to the kernel.
template<typename NumericT> char           cuda_arg(char val)           { return val; }
template<typename NumericT> unsigned char  cuda_arg(unsigned char val)  { return val; }

template<typename NumericT> short          cuda_arg(short val)          { return val; }
template<typename NumericT> unsigned short cuda_arg(unsigned short val) { return val; }

template<typename NumericT> int            cuda_arg(int val)            { return val; }
template<typename NumericT> unsigned int   cuda_arg(unsigned int val)   { return val; }

template<typename NumericT> long           cuda_arg(long val)           { return val; }
template<typename NumericT> unsigned long  cuda_arg(unsigned long val)  { return val; }

template<typename NumericT> float          cuda_arg(float val)          { return val; }
template<typename NumericT> double         cuda_arg(double val)         { return val; }

/** \endcond */



namespace linalg
{
namespace cuda
{


namespace detail
{

// Packs kernel option flags into a single word:
//   bit 0          - flip sign
//   bit 1          - reciprocal
//   bits 2 and up  - length (only encoded if length > 1)
inline unsigned int make_options(vcl_size_t length, bool reciprocal, bool flip_sign)
{
  return static_cast<unsigned int>( ((length > 1) ? (static_cast<unsigned int>(length) << 2) : 0) + (reciprocal ? 2 : 0) + (flip_sign ? 1 : 0) );
}

/** @brief Checks for a pending CUDA error and throws a cuda_exception carrying file, line, error code and message context.
*
* @param message  caller-supplied context string (usually the kernel name)
* @param file     source file of the check (supplied by VIENNACL_CUDA_LAST_ERROR_CHECK)
* @param line     source line of the check (supplied by VIENNACL_CUDA_LAST_ERROR_CHECK)
*/
inline void cuda_last_error_check(const char * message, const char * file, const int line )
{
  cudaError_t error_code = cudaGetLastError();

  if (cudaSuccess != error_code)
  {
    std::stringstream ss;
    // Fix: the separator was previously emitted twice ('<< "): " << ": '),
    // producing messages like "file(42): : getLastCudaError() ...".
    ss << file << "(" << line << "): getLastCudaError() CUDA error " << error_code << ": " << cudaGetErrorString( error_code ) << " @ " << message << std::endl;
    throw viennacl::backend::cuda::cuda_exception(ss.str(), error_code);
  }
}

// Maps a scalar type to its CUDA vector-of-two type (float -> float2, double -> double2).
template<typename NumericT>
struct type_to_type2;

template<>
struct type_to_type2<float>  { typedef float2  type; };

template<>
struct type_to_type2<double> { typedef double2 type; };


// For ViennaCL scalars the kernel argument is the raw CUDA handle of the scalar...
template<typename NumericT, typename OtherT>
typename viennacl::backend::mem_handle::cuda_handle_type       & arg_reference(viennacl::scalar<NumericT>       & s, OtherT) { return s.handle().cuda_handle(); }

template<typename NumericT, typename OtherT>
typename viennacl::backend::mem_handle::cuda_handle_type const & arg_reference(viennacl::scalar<NumericT> const & s, OtherT) { return s.handle().cuda_handle(); }

// ...all other cases where T is not a ViennaCL scalar: pass the CPU value through by const reference.
template<typename ArgT>
typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
                              char const &>::type
arg_reference(ArgT, char const & val)  { return val; }

template<typename ArgT>
typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
                              unsigned char const &>::type
arg_reference(ArgT, unsigned char const & val)  { return val; }

template<typename ArgT>
typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
                              short const &>::type
arg_reference(ArgT, short const & val)  { return val; }

template<typename ArgT>
typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
                              unsigned short const &>::type
arg_reference(ArgT, unsigned short const & val)  { return val; }

template<typename ArgT>
typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
                              int const &>::type
arg_reference(ArgT, int const & val)  { return val; }

template<typename ArgT>
typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
                              unsigned int const &>::type
arg_reference(ArgT, unsigned int const & val)  { return val; }

template<typename ArgT>
typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
                              long const &>::type
arg_reference(ArgT, long const & val)  { return val; }

template<typename ArgT>
typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
                              unsigned long const &>::type
arg_reference(ArgT, unsigned long const & val)  { return val; }

template<typename ArgT>
typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
                              float const &>::type
arg_reference(ArgT, float const & val)  { return val; }

template<typename ArgT>
typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
                              double const &>::type
arg_reference(ArgT, double const & val)  { return val; }

} //namespace detail
} //namespace cuda
} //namespace linalg
} //namespace viennacl


#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp
new file mode 100644
index 0000000..ae70f9a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp
@@ -0,0 +1,412 @@
#ifndef VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP
#define VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP

/*
=========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   [email protected]

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/cuda/direct_solve.hpp
    @brief Implementations of dense direct solvers using CUDA are found here.
*/

#include "viennacl/forwards.h"
#include "viennacl/vector.hpp"
#include "viennacl/matrix.hpp"


#include "viennacl/linalg/cuda/common.hpp"


namespace viennacl
{
namespace linalg
{
namespace cuda
{

// Back substitution for an upper-triangular A with multiple right-hand sides:
// each CUDA block handles one column of B (blockIdx.x); within a block, threads
// eliminate the entries above the current pivot row in parallel.
// Note: the __syncthreads() calls below are safe because 'unit_diagonal',
// 'row_major_A' and 'row_major_B' are kernel arguments and thus uniform
// across the whole block (no divergent barriers).
template<typename NumericT>
__global__ void matrix_matrix_upper_solve_kernel(
          const NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,
          bool row_major_A,

          NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_size1,  unsigned int B_size2,
          unsigned int B_internal_size1, unsigned int B_internal_size2,
          bool row_major_B,

          bool unit_diagonal)
{
  NumericT temp;
  NumericT entry_A;

  // process rows from the bottom up (A is required to be square)
  for (unsigned int row_cnt = 0; row_cnt < A_size1; ++row_cnt)
  {
    unsigned int row = A_size1 - 1 - row_cnt;

    if (!unit_diagonal)
    {
      __syncthreads();

      // single thread divides the pivot entry of B by the diagonal of A
      if (threadIdx.x == 0)
      {
        if (row_major_B)
          B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
                                                                                                             : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
        else //if (!row_major_B)
          B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
                                                                                                             : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
      }
    }

    __syncthreads();

    if (row_major_B)
      temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
    else //if (!row_major_B)
      temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];

    // eliminate column of op(A) with index 'row' in parallel
    for (unsigned int elim = threadIdx.x; elim < row; elim += blockDim.x)
    {
      if (row_major_A)
        entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
      else //if (!row_major_A)
        entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];

      if (row_major_B)
        B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
      else //if (!row_major_B)
        B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;

    }
  }
}



// Forward substitution for a lower-triangular A with multiple right-hand sides;
// same block/thread layout as matrix_matrix_upper_solve_kernel, but rows are
// processed top-down and entries below the pivot are eliminated.
template<typename NumericT>
__global__ void matrix_matrix_lower_solve_kernel(
          const NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,
          bool row_major_A,

          NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_size1,  unsigned int B_size2,
          unsigned int B_internal_size1, unsigned int B_internal_size2,
          bool row_major_B,

          bool unit_diagonal)
{
  NumericT temp;
  NumericT entry_A;

  for (unsigned int row = 0; row < A_size1; ++row)
  {

    if (!unit_diagonal)
    {
      __syncthreads();

      if (threadIdx.x == 0)
      {
        if (row_major_B)
          B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
                                                                                                             : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
        else //if (!row_major_B)
          B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
                                                                                                             : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
      }
    }

    __syncthreads();

    if (row_major_B)
      temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
    else //if (!row_major_B)
      temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];

    // eliminate column of op(A) with index 'row' in parallel
    for (unsigned int elim = row + threadIdx.x + 1; elim < A_size1; elim += blockDim.x)
    {
      if (row_major_A)
        entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
      else //if (!row_major_A)
        entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];

      if (row_major_B)
        B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
      else //if (!row_major_B)
        B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;

    }
  }
}






namespace detail
{
  // Tag dispatch: whether the solver tag implies an implicit unit diagonal.
  template<typename TagT>
  bool is_unit_solve(TagT const & tag) { return false; }

  inline bool is_unit_solve(viennacl::linalg::unit_lower_tag) { return true; }
  inline bool is_unit_solve(viennacl::linalg::unit_upper_tag) { return true; }

  // Tag dispatch: whether the solver tag denotes an upper-triangular system.
  template<typename TagT>
  bool is_upper_solve(TagT const & tag) { return false; }

  inline bool is_upper_solve(viennacl::linalg::upper_tag) { return true; }
  inline bool is_upper_solve(viennacl::linalg::unit_upper_tag) { return true; }

  // Launches the matrix-matrix triangular solve kernel: one block of 128 threads
  // per column (right-hand side) of B.
  // NOTE(review): no VIENNACL_CUDA_LAST_ERROR_CHECK follows these launches, so a
  // failed launch surfaces only at a later synchronizing call — consider adding one.
  template<typename Matrix1T, typename Matrix2T, typename SolverTagT>
  void inplace_solve_impl(Matrix1T const & A,
                          Matrix2T & B,
                          SolverTagT const & tag)
  {
    typedef typename viennacl::result_of::cpu_value_type<Matrix1T>::type    value_type;

    dim3 threads(128);
    dim3 grid(B.size2());

    if (is_upper_solve(tag))
    {
      matrix_matrix_upper_solve_kernel<<<grid,threads>>>(viennacl::cuda_arg(A),
                                                         static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
                                                         static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
                                                         static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
                                                         static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
                                                         bool(A.row_major()),

                                                         viennacl::cuda_arg(B),
                                                         static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
                                                         static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
                                                         static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
                                                         static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
                                                         bool(B.row_major()),

                                                         is_unit_solve(tag)
                                                        );
    }
    else
    {
      matrix_matrix_lower_solve_kernel<<<grid,threads>>>(viennacl::cuda_arg(A),
                                                         static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
                                                         static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
                                                         static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
                                                         static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
                                                         bool(A.row_major()),

                                                         viennacl::cuda_arg(B),
                                                         static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
                                                         static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
                                                         static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
                                                         static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
                                                         bool(B.row_major()),

                                                         is_unit_solve(tag)
                                                        );
    }

  }
}


//
// Note: By convention, all size checks are performed in the calling frontend. No need to double-check here.
//

////////////////// triangular solver //////////////////////////////////////
/** @brief Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notation).
*
* @param A    The system matrix
* @param B    The matrix of row vectors, where the solution is directly written to
* @param tag  Solver tag for identifying the respective triangular solver
*/
template<typename NumericT, typename SolverTagT>
void inplace_solve(matrix_base<NumericT> const & A,
                   matrix_base<NumericT> & B,
                   SolverTagT tag)
{
  detail::inplace_solve_impl(A, B, tag);
}


//
// Solve on vector
//

// Triangular substitution for a single right-hand-side vector, row-major A.
// 'options' bit layout (see detail::get_option_for_solver_tag below):
//   bit 0 - unit diagonal (skip the division by the pivot)
//   bit 2 - lower solve (forward substitution; otherwise backward)
// Launched with a single block, so the whole solve runs in one block with the
// elimination of each pivot column parallelized over threadIdx.x.
template<typename NumericT>
__global__ void triangular_substitute_inplace_row_kernel(
          NumericT const * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,
          NumericT * v,
          unsigned int v_start,
          unsigned int v_inc,
          unsigned int v_size,

          unsigned int options)
{
  NumericT temp;
  unsigned int unit_diagonal_flag = (options & (1 << 0));

  unsigned int is_lower_solve = (options & (1 << 2));
  unsigned int row;
  for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)  //Note: A required to be square
  {
    row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
    if (!unit_diagonal_flag)
    {
      __syncthreads();
      if (threadIdx.x == 0)
        v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
    }

    __syncthreads();

    temp = v[row * v_inc + v_start];

    // eliminate the remaining entries of the current pivot column in parallel
    for (int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
             elim < (is_lower_solve ? A_size1 : row);
             elim += blockDim.x)
      v[elim * v_inc + v_start] -= temp * A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
  }
}


// Same as triangular_substitute_inplace_row_kernel, but for column-major A
// (only the indexing into A differs).
template<typename NumericT>
__global__ void triangular_substitute_inplace_col_kernel(
          NumericT const * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,
          NumericT * v,
          unsigned int v_start,
          unsigned int v_inc,
          unsigned int v_size,
          unsigned int options)
{
  NumericT temp;
  unsigned int unit_diagonal_flag = (options & (1 << 0));

  unsigned int is_lower_solve = (options & (1 << 2));
  unsigned int row;
  for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)  //Note: A required to be square
  {
    row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
    if (!unit_diagonal_flag)
    {
      __syncthreads();
      if (threadIdx.x == 0)
        v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
    }

    __syncthreads();

    temp = v[row * v_inc + v_start];

    for (int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
             elim < (is_lower_solve ? A_size1 : row);
             elim += blockDim.x)
      v[elim * v_inc + v_start] -= temp * A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
  }
}


namespace detail
{
  // Encodes the solver tag into the kernel 'options' word: bit 0 = unit diagonal, bit 2 = lower solve.
  inline unsigned int get_option_for_solver_tag(viennacl::linalg::upper_tag)      { return 0; }
  inline unsigned int get_option_for_solver_tag(viennacl::linalg::unit_upper_tag) { return (1 << 0); }
  inline unsigned int get_option_for_solver_tag(viennacl::linalg::lower_tag)      { return (1 << 2); }
  inline unsigned int get_option_for_solver_tag(viennacl::linalg::unit_lower_tag) { return (1 << 2) | (1 << 0); }

  // Dispatches to the row-major or column-major substitution kernel; the whole
  // solve runs in a single block of 128 threads.
  template<typename MatrixT, typename VectorT>
  void inplace_solve_vector_impl(MatrixT const & mat,
                                 VectorT & vec,
                                 unsigned int options)
  {
    typedef typename viennacl::result_of::cpu_value_type<MatrixT>::type        value_type;

    if (mat.row_major())
    {
      triangular_substitute_inplace_row_kernel<<<1, 128>>>(viennacl::cuda_arg(mat),
                                                           static_cast<unsigned int>(viennacl::traits::start1(mat)),         static_cast<unsigned int>(viennacl::traits::start2(mat)),
                                                           static_cast<unsigned int>(viennacl::traits::stride1(mat)),        static_cast<unsigned int>(viennacl::traits::stride2(mat)),
                                                           static_cast<unsigned int>(viennacl::traits::size1(mat)),          static_cast<unsigned int>(viennacl::traits::size2(mat)),
                                                           static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
                                                           viennacl::cuda_arg(vec),
                                                           static_cast<unsigned int>(viennacl::traits::start(vec)),
                                                           static_cast<unsigned int>(viennacl::traits::stride(vec)),
                                                           static_cast<unsigned int>(viennacl::traits::size(vec)),
                                                           options
                                                          );
    }
    else
    {
      triangular_substitute_inplace_col_kernel<<<1, 128>>>(viennacl::cuda_arg(mat),
                                                           static_cast<unsigned int>(viennacl::traits::start1(mat)),         static_cast<unsigned int>(viennacl::traits::start2(mat)),
                                                           static_cast<unsigned int>(viennacl::traits::stride1(mat)),        static_cast<unsigned int>(viennacl::traits::stride2(mat)),
                                                           static_cast<unsigned int>(viennacl::traits::size1(mat)),          static_cast<unsigned int>(viennacl::traits::size2(mat)),
                                                           static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
                                                           viennacl::cuda_arg(vec),
                                                           static_cast<unsigned int>(viennacl::traits::start(vec)),
                                                           static_cast<unsigned int>(viennacl::traits::stride(vec)),
                                                           static_cast<unsigned int>(viennacl::traits::size(vec)),
                                                           options
                                                          );
    }
  }

}

/** @brief Direct inplace solver for dense triangular systems (non-transposed version)
*
* @param mat  The system matrix proxy
* @param vec  The load vector, where the solution is directly written to
*/
template<typename NumericT, typename SolverTagT>
void inplace_solve(matrix_base<NumericT> const & mat,
                   vector_base<NumericT> & vec,
                   SolverTagT)
{
  unsigned int options = detail::get_option_for_solver_tag(SolverTagT());

  detail::inplace_solve_vector_impl(mat, vec, options);
}


} // namespace cuda
} // namespace linalg
} // namespace viennacl

#endif
