http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp
 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp
new file mode 100755
index 0000000..960f5c2
--- /dev/null
+++ 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp
@@ -0,0 +1,180 @@
+#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_ONEI_HPP_
+#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_ONEI_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+
+/** @file viennacl/linalg/cuda/bisect_kernel_large_onei.hpp
+    @brief Determine eigenvalues for large matrices, for intervals that
+    contained exactly one eigenvalue after the first step
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for 
which
+    the creation of derivative works is allowed by including the following 
statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+// includes, project
+#include "viennacl/linalg/detail/bisect/config.hpp"
+#include "viennacl/linalg/detail/bisect/util.hpp"
+// additional kernel
+#include "viennacl/linalg/cuda/bisect_util.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+////////////////////////////////////////////////////////////////////////////////
+//! Determine eigenvalues for large matrices for intervals that after
+//! the first step contained one eigenvalue
+//! @param  g_d  diagonal elements of symmetric, tridiagonal matrix
+//! @param  g_s  superdiagonal elements of symmetric, tridiagonal matrix
+//! @param  n    matrix size
+//! @param  num_intervals  total number of intervals containing one eigenvalue
+//!                         after the first step
+//! @param g_left  left interval limits
+//! @param g_right  right interval limits
+//! @param g_pos  index of interval / number of intervals that are smaller than
+//!               right interval limit
+//! @param  precision  desired precision of eigenvalues
+////////////////////////////////////////////////////////////////////////////////
// Kernel: one thread per interval; each interval is known to already contain
// exactly one eigenvalue, so plain bisection (no interval subdivision) is
// sufficient.  All threads of a block must stay inside the main loop -- even
// those with gtid >= num_intervals -- so that the __syncthreads() barriers
// below are reached by the whole block.
template<typename NumericT>
__global__
void
bisectKernelLarge_OneIntervals(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
                               unsigned int num_intervals,
                               NumericT *g_left, NumericT *g_right,
                               unsigned int *g_pos,
                               NumericT  precision)
{

  // flat 1D global thread index
  const unsigned int gtid = (blockDim.x * blockIdx.x) + threadIdx.x;

  // scratch buffers handed to computeNumSmallerEigenvalsLarge() for staging
  // chunks of the diagonal / superdiagonal in shared memory
  __shared__  NumericT  s_left_scratch[VIENNACL_BISECT_MAX_THREADS_BLOCK];
  __shared__  NumericT  s_right_scratch[VIENNACL_BISECT_MAX_THREADS_BLOCK];

  // active interval of thread
  // left and right limit of current interval
  NumericT left, right;
  // number of eigenvalues smaller than the right limit (also corresponds to
  // the global index of the eigenvalue contained in the active interval)
  unsigned int right_count;
  // flag if current thread converged
  unsigned int converged = 0;
  // midpoint when current interval is subdivided
  NumericT mid = 0.0f;
  // number of eigenvalues less than mid
  unsigned int mid_count = 0;

  // read data from global memory; left/right/right_count remain uninitialized
  // for out-of-range threads, but every later use is guarded by the same
  // gtid < num_intervals test
  if (gtid < num_intervals)
  {
    left = g_left[gtid];
    right = g_right[gtid];
    right_count = g_pos[gtid];
  }


  // flag to determine if all threads converged to eigenvalue
  __shared__  unsigned int  converged_all_threads;

  // initialize shared flag
  if (0 == threadIdx.x)
  {
    converged_all_threads = 0;
  }

  __syncthreads();

  // process until all threads converged to an eigenvalue
  while (true)
  {

    // optimistically assume convergence; every thread writes the same value,
    // so the concurrent unguarded writes are a benign race
    converged_all_threads = 1;

    // update midpoint for all active threads
    if ((gtid < num_intervals) && (0 == converged))
    {
      mid = computeMidpoint(left, right);
    }

    // find number of eigenvalues that are smaller than midpoint; must be
    // executed by ALL threads of the block (it synchronizes internally while
    // staging matrix data through the shared scratch buffers)
    mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
                                                mid, gtid, num_intervals,
                                                s_left_scratch,
                                                s_right_scratch,
                                                converged);

    __syncthreads();

    // for all active threads
    if ((gtid < num_intervals) && (0 == converged))
    {

      // update intervals -- always one child interval survives because the
      // interval contains exactly one eigenvalue
      if (right_count == mid_count)
      {
        right = mid;
      }
      else
      {
        left = mid;
      }

      // check for convergence: width must fall below both the absolute
      // precision and the magnitude-scaled relative threshold
      NumericT t0 = right - left;
      NumericT t1 = max(abs(right), abs(left)) * precision;

      if (t0 < min(precision, t1))
      {
        // collapse the interval onto its midpoint -- the eigenvalue estimate
        NumericT lambda = computeMidpoint(left, right);
        left = lambda;
        right = lambda;

        converged = 1;
      }
      else
      {
        // at least one interval is still open -> another sweep is needed
        converged_all_threads = 0;
      }
    }

    __syncthreads();

    if (1 == converged_all_threads)
    {
      break;
    }

    // barrier before the next iteration overwrites converged_all_threads
    __syncthreads();
  }

  // write data back to global memory
  __syncthreads();

  if (gtid < num_intervals)
  {
      // interval converged, so left and right interval limit are identical
      // and equal to the eigenvalue; storing the left limit suffices
      g_left[gtid] = left;
  }
}
+} // namespace cuda
+} // namespace linalg
+} // namespace viennacl
+#endif // #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_ONEI_HPP_

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp
new file mode 100755
index 0000000..310b381
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp
@@ -0,0 +1,261 @@
+#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_SMALL_HPP_
+#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_SMALL_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+
+/** @file viennacl/linalg/cuda/bisect_kernel_small.hpp
+    @brief Determine eigenvalues for small symmetric, tridiagonal matrix
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for 
which
+    the creation of derivative works is allowed by including the following 
statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+// includes, project
+#include "viennacl/linalg/detail/bisect/config.hpp"
+#include "viennacl/linalg/detail/bisect/util.hpp"
+// additional kernel
+#include "viennacl/linalg/cuda/bisect_util.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+/** @brief Bisection to find eigenvalues of a real, symmetric, and tridiagonal 
matrix
+*
+* @param  g_d  diagonal elements in global memory
+* @param  g_s  superdiagonal elements in global memory (stored so that the
+* element *(g_s - 1) can be accessed and equals 0)
+* @param  n    size of matrix
+* @param  g_left         helper array
+* @param  g_right        helper array
+* @param  g_left_count   helper array
+* @param  g_right_count  helper array
+* @param  lg             lower bound of input interval (e.g. Gerschgorin 
interval)
+* @param  ug             upper bound of input interval (e.g. Gerschgorin 
interval)
+* @param  lg_eig_count   number of eigenvalues that are smaller than lg
+* @param  ug_eig_count   number of eigenvalues that are smaller than ug
+* @param  epsilon        desired accuracy of eigenvalues to compute
+*/
// NOTE(review): this kernel indexes only via threadIdx.x and writes the final
// result with g_left[threadIdx.x], so it appears designed for a single-block
// launch whose blockDim.x covers the matrix size n -- confirm against caller.
template<typename NumericT>
__global__
void
bisectKernelSmall(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
             NumericT * g_left, NumericT *g_right,
             unsigned int *g_left_count, unsigned int *g_right_count,
             const NumericT lg, const NumericT ug,
             const unsigned int lg_eig_count, const unsigned int ug_eig_count,
             NumericT epsilon
            )
{
    // intervals (store left and right because the subdivision tree is in
    // general not dense)
    __shared__  NumericT  s_left[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];
    __shared__  NumericT  s_right[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];

    // number of eigenvalues that are smaller than s_left / s_right
    // (correspondence is realized via indices)
    __shared__  unsigned int  s_left_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];
    __shared__  unsigned int  s_right_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];

    // helper for stream compaction
    __shared__  unsigned int
    s_compaction_list[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX + 1];

    // state variables for whole block
    // if 0 then compaction of second chunk of child intervals is not necessary
    // (because all intervals had exactly one non-dead child)
    __shared__  unsigned int compact_second_chunk;
    __shared__  unsigned int all_threads_converged;

    // number of currently active threads
    __shared__  unsigned int num_threads_active;

    // number of threads to use for stream compaction
    __shared__  unsigned int num_threads_compaction;

    // helper for exclusive scan (slot 0 of s_compaction_list stays reserved)
    unsigned int *s_compaction_list_exc = s_compaction_list + 1;


    // variables for currently processed interval
    // left and right limit of active interval
    NumericT  left = 0.0f;
    NumericT  right = 0.0f;
    unsigned int left_count = 0;
    unsigned int right_count = 0;
    // midpoint of active interval
    NumericT  mid = 0.0f;
    // number of eigenvalues smaller than mid
    unsigned int mid_count = 0;
    // affected from compaction
    unsigned int  is_active_second = 0;

    // clear shared state before use (shared memory is uninitialized)
    s_compaction_list[threadIdx.x] = 0;
    s_left[threadIdx.x] = 0;
    s_right[threadIdx.x] = 0;
    s_left_count[threadIdx.x] = 0;
    s_right_count[threadIdx.x] = 0;

    __syncthreads();

    // set up initial configuration: thread 0 seeds the root interval
    // [lg, ug] with its eigenvalue counts
    if (0 == threadIdx.x)
    {
        s_left[0] = lg;
        s_right[0] = ug;
        s_left_count[0] = lg_eig_count;
        s_right_count[0] = ug_eig_count;

        compact_second_chunk = 0;
        num_threads_active = 1;

        num_threads_compaction = 1;
    }

    // for all active threads read intervals from the last level
    // the number of (worst case) active threads per level l is 2^l
    while (true)
    {

        // optimistic flag: every thread writes 1 (benign race); presumably
        // cleared inside subdivideActiveIntervalMulti() when some interval
        // still needs work -- verify in bisect_util.hpp
        all_threads_converged = 1;
        __syncthreads();

        is_active_second = 0;
        subdivideActiveIntervalMulti(threadIdx.x,
                                s_left, s_right, s_left_count, s_right_count,
                                num_threads_active,
                                left, right, left_count, right_count,
                                mid, all_threads_converged);

        __syncthreads();

        // check if done
        if (1 == all_threads_converged)
        {
            break;
        }

        __syncthreads();

        // compute number of eigenvalues smaller than mid
        // use all threads for reading the necessary matrix data from global
        // memory
        // use s_left and s_right as scratch space for diagonal and
        // superdiagonal of matrix
        mid_count = computeNumSmallerEigenvals(g_d, g_s, n, mid,
                                               threadIdx.x, num_threads_active,
                                               s_left, s_right,
                                               (left == right));

        __syncthreads();

        // store intervals
        // for all threads store the first child interval in a continuous chunk
        // of memory, and the second child interval -- if it exists -- in a
        // second chunk; it is likely that all threads reach convergence up to
        // \a epsilon at the same level; furthermore, for higher level most /
        // all threads will have only one child, storing the first child
        // compactly will (first) avoid to perform a compaction step on the
        // first chunk, (second) make it for higher levels (when all threads /
        // intervals have exactly one child) unnecessary to perform a
        // compaction of the second chunk
        if (threadIdx.x < num_threads_active)
        {

            if (left != right)
            {

                // still an open interval: store its surviving children
                storeNonEmptyIntervals(threadIdx.x, num_threads_active,
                                       s_left, s_right, s_left_count, s_right_count,
                                       left, mid, right,
                                       left_count, mid_count, right_count,
                                       epsilon, compact_second_chunk,
                                       s_compaction_list_exc,
                                       is_active_second);
            }
            else
            {

                // interval already collapsed onto an eigenvalue: re-store it
                storeIntervalConverged(s_left, s_right, s_left_count, s_right_count,
                                       left, mid, right,
                                       left_count, mid_count, right_count,
                                       s_compaction_list_exc, compact_second_chunk,
                                       num_threads_active,
                                       is_active_second);
            }
        }

        // necessary so that compact_second_chunk is up-to-date
        __syncthreads();

        // perform compaction of chunk where second children are stored
        // scan of (num_threads_active / 2) elements, thus at most
        // (num_threads_active / 4) threads are needed
        if (compact_second_chunk > 0)
        {

            createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);

            compactIntervals(s_left, s_right, s_left_count, s_right_count,
                             mid, right, mid_count, right_count,
                             s_compaction_list, num_threads_active,
                             is_active_second);
        }

        __syncthreads();

        if (0 == threadIdx.x)
        {

            // update number of active threads with result of reduction
            num_threads_active += s_compaction_list[num_threads_active];

            // round up to a power of two for the next compaction scan
            num_threads_compaction = ceilPow2(num_threads_active);

            compact_second_chunk = 0;
        }

        __syncthreads();

    }

    __syncthreads();

    // write resulting intervals to global mem
    // for all threads write if they have been converged to an eigenvalue to
    // a separate array

    // at most n valid intervals
    if (threadIdx.x < n)
    {

        // intervals converged so left and right limit are identical
        g_left[threadIdx.x]  = s_left[threadIdx.x];
        // left count is sufficient to have global order
        g_left_count[threadIdx.x]  = s_left_count[threadIdx.x];
    }
}
+} // namespace cuda
+} // namespace linalg
+} // namespace viennacl
+#endif // #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_SMALL_HPP_

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp
new file mode 100755
index 0000000..e2e262c
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp
@@ -0,0 +1,613 @@
+#ifndef VIENNACL_LINALG_CUDA_BISECT_BISECT_UTIL_HPP_
+#define VIENNACL_LINALG_CUDA_BISECT_BISECT_UTIL_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+
+/** @file viennacl/linalg/cuda/bisect_util.hpp
+    @brief Utility / shared functionality for bisection kernels
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for 
which
+    the creation of derivative works is allowed by including the following 
statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+// includes, project
+#include "viennacl/linalg/detail/bisect/config.hpp"
+#include "viennacl/linalg/detail/bisect/util.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+////////////////////////////////////////////////////////////////////////////////
+//! Compute the next lower power of two of n
+//! @param  n  number for which the next lower power of two is sought
+////////////////////////////////////////////////////////////////////////////////
// Compute the largest power of two that is <= n.
// @param n  number whose next lower power of two is sought
// @return   n itself if n is already a power of two (this also maps n == 0
//           to 0), otherwise 2^(exp-1) where n = m * 2^exp with 0.5 <= m < 1
__device__
inline int
floorPow2(int n)
{

    // early out if already power of two
    if (0 == (n & (n-1)))
    {
        return n;
    }

    // use the single-precision frexpf: the original frexp((float)n, ...)
    // silently promoted to the double-precision code path on the device
    int exp;
    frexpf(static_cast<float>(n), &exp);
    return (1 << (exp - 1));
}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compute the next higher power of two of n
+//! @param  n  number for which the next higher power of two is sought
+////////////////////////////////////////////////////////////////////////////////
// Compute the smallest power of two that is >= n.
// @param n  number whose next higher power of two is sought
// @return   n itself if n is already a power of two (this also maps n == 0
//           to 0), otherwise 2^exp where n = m * 2^exp with 0.5 <= m < 1
// NOTE(review): for n large enough that (float)n rounds up to a power of two
// the result can overshoot by one doubling; callers here use it only for
// small thread counts, where the conversion is exact.
__device__
inline int
ceilPow2(int n)
{

    // early out if already power of two
    if (0 == (n & (n-1)))
    {
        return n;
    }

    // use the single-precision frexpf: the original frexp((float)n, ...)
    // silently promoted to the double-precision code path on the device
    int exp;
    frexpf(static_cast<float>(n), &exp);
    return (1 << exp);
}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compute midpoint of interval [\a left, \a right] avoiding overflow if
+//! possible
+//! @param left   left / lower limit of interval
+//! @param right  right / upper limit of interval
+////////////////////////////////////////////////////////////////////////////////
// Midpoint of the interval [left, right], computed so that the sum
// left + right cannot overflow when both limits share the same sign.
// @param left   left / lower limit of interval
// @param right  right / upper limit of interval
// @return       midpoint of the interval
template<typename NumericT>
__device__
inline NumericT
computeMidpoint(const NumericT left, const NumericT right)
{
    // same sign: step half the width from 'left' to stay in range;
    // opposite signs: the plain average moves toward zero and is safe
    const bool same_sign =
        (viennacl::linalg::detail::sign_f(left) == viennacl::linalg::detail::sign_f(right));

    return same_sign ? (left + (right - left) * 0.5f)
                     : ((left + right) * 0.5f);
}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Check if interval converged and store appropriately
+//! @param  addr    address where to store the information of the interval
+//! @param  s_left  shared memory storage for left interval limits
+//! @param  s_right  shared memory storage for right interval limits
+//! @param  s_left_count  shared memory storage for number of eigenvalues less
+//!                       than left interval limits
+//! @param  s_right_count  shared memory storage for number of eigenvalues less
+//!                       than right interval limits
+//! @param  left   lower limit of interval
+//! @param  right  upper limit of interval
+//! @param  left_count  eigenvalues less than \a left
+//! @param  right_count  eigenvalues less than \a right
+//! @param  precision  desired precision for eigenvalues
+////////////////////////////////////////////////////////////////////////////////
// Store an interval into the shared-memory arrays at slot 'addr', collapsing
// it onto its midpoint first when it has converged.
// @param addr           slot where the interval is stored
// @param s_left         shared memory storage for left interval limits
// @param s_right        shared memory storage for right interval limits
// @param s_left_count   shared storage for #eigenvalues below the left limit
// @param s_right_count  shared storage for #eigenvalues below the right limit
// @param left           lower limit of interval
// @param right          upper limit of interval
// @param left_count     eigenvalues less than \a left
// @param right_count    eigenvalues less than \a right
// @param precision      desired precision for eigenvalues
template<class S, class T, class NumericT>
__device__
void
storeInterval(unsigned int addr,
              NumericT *s_left, NumericT *s_right,
              T *s_left_count, T *s_right_count,
              NumericT left, NumericT right,
              S left_count, S right_count,
              NumericT precision)
{
    // the eigenvalue counts are stored unconditionally
    s_left_count[addr] = left_count;
    s_right_count[addr] = right_count;

    // converged when the interval width drops below the larger of the
    // absolute floor and the magnitude-scaled relative tolerance
    const NumericT width     = abs(right - left);
    const NumericT rel_bound = max(abs(left), abs(right)) * precision;

    if (width <= max(static_cast<NumericT>(VIENNACL_BISECT_MIN_ABS_INTERVAL), rel_bound))
    {
        // collapse both limits onto the midpoint to mark convergence
        const NumericT lambda = computeMidpoint(left, right);
        left  = lambda;
        right = lambda;
    }

    // store the current (possibly collapsed) limits
    s_left[addr]  = left;
    s_right[addr] = right;
}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compute number of eigenvalues that are smaller than x given a symmetric,
+//! real, and tridiagonal matrix
+//! @param  g_d  diagonal elements stored in global memory
+//! @param  g_s  superdiagonal elements stored in global memory
+//! @param  n    size of matrix
+//! @param  x    value for which the number of eigenvalues that are smaller is
+//!              seeked
+//! @param  tid  thread identified (e.g. threadIdx.x or gtid)
+//! @param  num_intervals_active  number of active intervals / threads that
+//!                               currently process an interval
+//! @param  s_d  scratch space to store diagonal entries of the tridiagonal
+//!              matrix in shared memory
+//! @param  s_s  scratch space to store superdiagonal entries of the 
tridiagonal
+//!              matrix in shared memory
+//! @param  converged  flag if the current thread is already converged (that
+//!         is count does not have to be computed)
+////////////////////////////////////////////////////////////////////////////////
// Whole-block helper: contains barriers, so it must be called unconditionally
// by every thread of the block.
template<typename NumericT>
__device__
inline unsigned int
computeNumSmallerEigenvals(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
                           const NumericT x,
                           const unsigned int tid,
                           const unsigned int num_intervals_active,
                           NumericT *s_d, NumericT *s_s,
                           unsigned int converged
                          )
{

    NumericT  delta = 1.0f;
    unsigned int count = 0;

    __syncthreads();

    // read data into shared memory, one element per thread
    // (assumes blockDim.x >= n for this small-matrix code path -- only the
    // first n threads load, and the elimination loop below reads s_d[0..n-1])
    if (threadIdx.x < n)
    {
        s_d[threadIdx.x] = *(g_d + threadIdx.x);
        // precondition (see caller docs): *(g_s - 1) must be accessible and
        // equal 0, so the k == 0 recurrence step uses a zero superdiagonal
        s_s[threadIdx.x] = *(g_s + threadIdx.x - 1);
    }

    __syncthreads();

    // perform loop only for active, not-yet-converged threads
    if ((tid < num_intervals_active) && (0 == converged))
    {

        // perform (optimized) Gaussian elimination to determine the number
        // of eigenvalues that are smaller than x
        for (unsigned int k = 0; k < n; ++k)
        {
            delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta;
            count += (delta < 0) ? 1 : 0;
        }

    }  // end if thread currently processing an interval

    return count;
}
+////////////////////////////////////////////////////////////////////////////////
+//! Compute number of eigenvalues that are smaller than x given a symmetric,
+//! real, and tridiagonal matrix
+//! @param  g_d  diagonal elements stored in global memory
+//! @param  g_s  superdiagonal elements stored in global memory
+//! @param  n    size of matrix
+//! @param  x    value for which the number of eigenvalues that are smaller is
+//!              seeked
+//! @param  tid  thread identified (e.g. threadIdx.x or gtid)
+//! @param  num_intervals_active  number of active intervals / threads that
+//!                               currently process an interval
+//! @param  s_d  scratch space to store diagonal entries of the tridiagonal
+//!              matrix in shared memory
+//! @param  s_s  scratch space to store superdiagonal entries of the 
tridiagonal
+//!              matrix in shared memory
+//! @param  converged  flag if the current thread is already converged (that
+//!         is count does not have to be computed)
+////////////////////////////////////////////////////////////////////////////////
// Whole-block helper for matrices larger than one shared-memory tile: the
// diagonal / superdiagonal are streamed through s_d / s_s chunk by chunk.
// Contains barriers, so every thread of the block must execute it.
template<typename NumericT>
__device__
inline unsigned int
computeNumSmallerEigenvalsLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
                                const NumericT x,
                                const unsigned int tid,
                                const unsigned int num_intervals_active,
                                NumericT *s_d, NumericT *s_s,
                                unsigned int converged
                               )
{
    NumericT  delta = 1.0f;
    unsigned int count = 0;

    // elements of diagonal / superdiagonal not yet processed
    unsigned int rem = n;

    // do until whole diagonal and superdiagonal has been loaded and processed
    for (unsigned int i = 0; i < n; i += blockDim.x)
    {

        __syncthreads();

        // read new chunk of data into shared memory (blockDim.x elements)
        if ((i + threadIdx.x) < n)
        {

            s_d[threadIdx.x] = *(g_d + i + threadIdx.x);
            // precondition (see caller docs): *(g_s - 1) must be accessible
            // and equal 0
            s_s[threadIdx.x] = *(g_s + i + threadIdx.x - 1);
        }

        __syncthreads();


        // NOTE(review): unlike computeNumSmallerEigenvals(), the 'converged'
        // parameter is accepted but not consulted here, so already-converged
        // threads redo the count -- redundant work, but harmless
        if (tid < num_intervals_active)
        {

            // perform (optimized) Gaussian elimination to determine the number
            // of eigenvalues that are smaller than x
            for (unsigned int k = 0; k < min(rem,blockDim.x); ++k)
            {
                delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta;
                // delta = (abs( delta) < (1.0e-10)) ? -(1.0e-10) : delta;
                count += (delta < 0) ? 1 : 0;
            }

        }  // end if thread currently processing an interval

        // may wrap around below zero (unsigned) on the final chunk; the stale
        // value is never read because the loop exits before the next min()
        rem -= blockDim.x;
    }

    return count;
}
+
/** @brief Store all non-empty intervals resulting from the subdivision of the interval currently processed by the thread.
*
* @param  addr                   base address for storing intervals
* @param  num_threads_active     number of threads / intervals in current sweep
* @param  s_left                 shared memory storage for left interval limits
* @param  s_right                shared memory storage for right interval limits
* @param  s_left_count           shared memory storage for number of eigenvalues less than left interval limits
* @param  s_right_count          shared memory storage for number of eigenvalues less than right interval limits
* @param  left                   lower limit of interval
* @param  mid                    midpoint of interval
* @param  right                  upper limit of interval
* @param  left_count             eigenvalues less than \a left
* @param  mid_count              eigenvalues less than \a mid
* @param  right_count            eigenvalues less than \a right
* @param  precision              desired precision for eigenvalues
* @param  compact_second_chunk   shared mem flag if second chunk is used and ergo requires compaction
* @param  s_compaction_list_exc  helper array for stream compaction, s_compaction_list_exc[tid] = 1 when the thread generated two child intervals
* @param  is_active_second       mark if thread has a second non-empty child interval
*/
template<class S, class T, class NumericT>
__device__
void
storeNonEmptyIntervals(unsigned int addr,
                       const unsigned int num_threads_active,
                       NumericT  *s_left, NumericT *s_right,
                       T  *s_left_count, T *s_right_count,
                       NumericT left, NumericT mid, NumericT right,
                       const S left_count,
                       const S mid_count,
                       const S right_count,
                       NumericT precision,
                       unsigned int &compact_second_chunk,
                       T *s_compaction_list_exc,
                       unsigned int &is_active_second)
{
    // a child interval is non-empty iff the eigenvalue count changes across it
    const bool left_child_nonempty  = (left_count != mid_count);
    const bool right_child_nonempty = (mid_count != right_count);

    if (left_child_nonempty && right_child_nonempty)
    {
        // the left child is stored in its regular slot right away ...
        storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
                      left, mid, left_count, mid_count, precision);

        // ... while the right child can only be stored after stream compaction
        // of the second chunk, so just flag its existence here
        is_active_second = 1;
        s_compaction_list_exc[threadIdx.x] = 1;
        compact_second_chunk = 1;
    }
    else
    {
        // at most one non-empty child: no second-chunk entry for this thread
        is_active_second = 0;
        s_compaction_list_exc[threadIdx.x] = 0;

        // store whichever child is valid (the right child when the left one
        // is empty, matching the original fall-through behavior)
        if (left_child_nonempty)
        {
            storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
                          left, mid, left_count, mid_count, precision);
        }
        else
        {
            storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
                          mid, right, mid_count, right_count, precision);
        }
    }
}
////////////////////////////////////////////////////////////////////////////////
//! Create indices for stream compaction: performs an in-place prefix sum
//! (Blelloch-style up-sweep followed by a partial down-sweep) over
//! \a s_compaction_list_exc, which is 1 for intervals that generated a second
//! child and 0 otherwise. Afterwards each flagged element holds the index at
//! which its second child interval belongs in the compacted representation of
//! all generated second children.
//! NOTE(review): num_threads_compaction is presumably a power of two and
//! uniform across the block -- the barriers sit outside the divergent branch
//! and the tree arithmetic assumes power-of-two sizes; verify at call sites.
//! @param   s_compaction_list_exc  list (in shared memory) containing the
//!                                 flags of threads that generated two childs;
//!                                 overwritten with compaction indices
//! @param   num_threads_compaction number of threads to employ for compaction
////////////////////////////////////////////////////////////////////////////////
template<class T>
__device__
void
createIndicesCompaction(T *s_compaction_list_exc,
                        unsigned int num_threads_compaction)
{

    unsigned int offset = 1;
    const unsigned int tid = threadIdx.x;

    // up-sweep: accumulate partial sums at the higher levels of the scan tree
    for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
    {
        // reached by all threads of the block (d is uniform), so this barrier
        // legally orders the previous level before the next one
        __syncthreads();

        if (tid < d)
        {

            unsigned int  ai = offset*(2*tid+1)-1;
            unsigned int  bi = offset*(2*tid+2)-1;

            s_compaction_list_exc[bi] =   s_compaction_list_exc[bi]
                                          + s_compaction_list_exc[ai];
        }

        offset <<= 1;
    }

    // down-sweep: traverse down tree, first down to level 2 across
    for (int d = 2; d < num_threads_compaction; d <<= 1)
    {

        offset >>= 1;
        __syncthreads();

        if (tid < (d-1))
        {

            unsigned int  ai = offset*(tid+1) - 1;
            unsigned int  bi = ai + (offset >> 1);

            s_compaction_list_exc[bi] =   s_compaction_list_exc[bi]
                                          + s_compaction_list_exc[ai];
        }
    }

    // final barrier: indices are complete and visible to the whole block
    __syncthreads();

}
+
/** @brief Perform stream compaction for second child intervals.
*
* @param  s_left              shared memory storage for left interval limits
* @param  s_right             shared memory storage for right interval limits
* @param  s_left_count        shared memory storage for number of eigenvalues less than left interval limits
* @param  s_right_count       shared memory storage for number of eigenvalues less than right interval limits
* @param  mid                 midpoint of current interval (left of new interval)
* @param  right               upper limit of interval
* @param  mid_count           eigenvalues less than mid
* @param  right_count         eigenvalues less than right
* @param  s_compaction_list   list containing the indices where the data has to be stored
* @param  num_threads_active  number of active threads / intervals
* @param  is_active_second    mark if thread has a second non-empty child interval
*/
template<class T, class NumericT>
__device__
void
compactIntervals(NumericT *s_left, NumericT *s_right,
                 T *s_left_count, T *s_right_count,
                 NumericT mid, NumericT right,
                 unsigned int mid_count, unsigned int right_count,
                 T *s_compaction_list,
                 unsigned int num_threads_active,
                 unsigned int is_active_second)
{
    const unsigned int tid = threadIdx.x;

    // nothing to do for inactive threads or threads whose second child is dead
    if ((tid >= num_threads_active) || (1 != is_active_second))
        return;

    // second children live in the upper half of the shared arrays;
    // s_compaction_list supplies the compacted slot within that half
    const unsigned int dst = num_threads_active + s_compaction_list[tid];

    s_left[dst]        = mid;
    s_right[dst]       = right;
    s_left_count[dst]  = mid_count;
    s_right_count[dst] = right_count;
}
+
/** @brief Re-store a converged interval; intervals enclosing an eigenvalue of
*          multiplicity greater than one are split so that the second half can
*          be emitted as an additional child interval after stream compaction.
*
* @param  s_left / s_right            shared memory storage for interval limits
* @param  s_left_count / s_right_count shared memory storage for eigenvalue counts
* @param  left, mid, right            interval limits of the current thread
* @param  left_count, mid_count, right_count  eigenvalue counts at the limits
* @param  s_compaction_list_exc       stream-compaction helper flags
* @param  compact_second_chunk        shared mem flag: second chunk needs compaction
* @param  num_threads_active          number of active threads / intervals
* @param  is_active_second            mark if thread has a second child interval
*/
template<class T, class S, class NumericT>
__device__
void
storeIntervalConverged(NumericT *s_left, NumericT *s_right,
                       T *s_left_count, T *s_right_count,
                       NumericT &left, NumericT &mid, NumericT &right,
                       S &left_count, S &mid_count, S &right_count,
                       T *s_compaction_list_exc,
                       unsigned int &compact_second_chunk,
                       const unsigned int num_threads_active,
                       unsigned int &is_active_second)
{
    const unsigned int tid = threadIdx.x;
    const unsigned int multiplicity = right_count - left_count;

    // the interval limits themselves are re-stored in place in either case
    s_left[tid] = left;
    s_right[tid] = right;
    s_left_count[tid] = left_count;

    if (1 == multiplicity)
    {
        // simple eigenvalue: keep the interval as-is, no second child
        s_right_count[tid] = right_count;

        is_active_second = 0;
        s_compaction_list_exc[tid] = 0;
    }
    else
    {
        // multiple eigenvalue: split the enclosed count evenly; this slot
        // keeps the first half, the second half becomes a second child
        // interval stored after stream compaction
        mid_count = left_count + (multiplicity >> 1);
        s_right_count[tid] = mid_count;
        mid = left;

        is_active_second = 1;
        s_compaction_list_exc[tid] = 1;
        compact_second_chunk = 1;
    }
}
+
/** @brief Subdivide interval if active and not already converged (variant that
*          also keeps intervals with multiple enclosed eigenvalues active).
*
* @param  tid                    id of thread
* @param  s_left                 shared memory storage for left interval limits
* @param  s_right                shared memory storage for right interval limits
* @param  s_left_count           shared memory storage for number of eigenvalues less than left interval limits
* @param  s_right_count          shared memory storage for number of eigenvalues less than right interval limits
* @param  num_threads_active     number of active threads in warp
* @param  left                   lower limit of interval
* @param  right                  upper limit of interval
* @param  left_count             eigenvalues less than \a left
* @param  right_count            eigenvalues less than \a right
* @param  mid                    median of interval
* @param  all_threads_converged  shared memory flag, cleared when this thread still has work
*/
template<class T, class NumericT>
__device__
void
subdivideActiveIntervalMulti(const unsigned int tid,
                        NumericT *s_left, NumericT *s_right,
                        T *s_left_count, T *s_right_count,
                        const unsigned int num_threads_active,
                        NumericT &left, NumericT &right,
                        unsigned int &left_count, unsigned int &right_count,
                        NumericT &mid, unsigned int &all_threads_converged)
{
  // inactive threads leave all outputs untouched
  if (tid >= num_threads_active)
    return;

  left        = s_left[tid];
  right       = s_right[tid];
  left_count  = s_left_count[tid];
  right_count = s_right_count[tid];

  if (left != right)
  {
    // interval not yet collapsed to a point: bisect it
    mid = computeMidpoint(left, right);
    all_threads_converged = 0;
  }
  else if ((right_count - left_count) > 1)
  {
    // converged point but multiple eigenvalues enclosed: keep iterating so
    // the interval can be duplicated in storeIntervalConverged()
    all_threads_converged = 0;
  }
}
+
+
/** @brief Subdivide interval if active and not already converged.
*
* @param  tid                    id of thread
* @param  s_left                 shared memory storage for left interval limits
* @param  s_right                shared memory storage for right interval limits
* @param  s_left_count           shared memory storage for number of eigenvalues less than left interval limits
* @param  s_right_count          shared memory storage for number of eigenvalues less than right interval limits
* @param  num_threads_active     number of active threads in warp
* @param  left                   lower limit of interval
* @param  right                  upper limit of interval
* @param  left_count             eigenvalues less than \a left
* @param  right_count            eigenvalues less than \a right
* @param  mid                    median of interval
* @param  all_threads_converged  shared memory flag, cleared when this thread still has work
*/
template<class T, class NumericT>
__device__
void
subdivideActiveInterval(const unsigned int tid,
                        NumericT *s_left, NumericT *s_right,
                        T *s_left_count, T *s_right_count,
                        const unsigned int num_threads_active,
                        NumericT &left, NumericT &right,
                        unsigned int &left_count, unsigned int &right_count,
                        NumericT &mid, unsigned int &all_threads_converged)
{
  // inactive threads leave all outputs untouched
  if (tid >= num_threads_active)
    return;

  left        = s_left[tid];
  right       = s_right[tid];
  left_count  = s_left_count[tid];
  right_count = s_right_count[tid];

  // a collapsed interval (left == right) is converged; otherwise bisect it
  if (left != right)
  {
    mid = computeMidpoint(left, right);
    all_threads_converged = 0;
  }
}
+}
+}
+}
+
+#endif // #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_ONEI_HPP_
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp
new file mode 100644
index 0000000..3622b89
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp
@@ -0,0 +1,250 @@
+#ifndef VIENNACL_LINALG_CUDA_COMMON_HPP_
+#define VIENNACL_LINALG_CUDA_COMMON_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+/** @file viennacl/linalg/cuda/common.hpp
+    @brief Common routines for CUDA execution
+*/
+
+#include <sstream>
+#include <cuda_runtime.h>
+#include "viennacl/backend/cuda.hpp"
+#include "viennacl/traits/handle.hpp"
+
+#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)  detail::cuda_last_error_check 
(message, __FILE__, __LINE__)
+
+namespace viennacl
+{
+
////// scalar

/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Non-const version.
*
* @param obj  ViennaCL scalar whose payload resides in CUDA device memory
* @return     raw device pointer to the payload (suitable as a kernel argument)
*/
template<typename NumericT>
NumericT * cuda_arg(scalar<NumericT> & obj)
{
  return reinterpret_cast<NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
}

/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Const version. */
template<typename NumericT>
const NumericT * cuda_arg(scalar<NumericT> const & obj)
{
  return reinterpret_cast<const NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
}


////// vector_base

/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL vector (through the base class vector_base) with implicit return type deduction. Non-const version. */
template<typename NumericT>
NumericT * cuda_arg(vector_base<NumericT> & obj)
{
  return reinterpret_cast<NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
}

/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL vector (through the base class vector_base) with implicit return type deduction. Const version. */
template<typename NumericT>
const NumericT * cuda_arg(vector_base<NumericT> const & obj)
{
  return reinterpret_cast<const NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
}

/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL vector (through the base class vector_base).
*          Return type needs to be explicitly provided as first template argument
*          (call as cuda_arg<SomeT>(v) to reinterpret the underlying buffer). Non-const version. */
template<typename ReturnT, typename NumericT>
ReturnT * cuda_arg(vector_base<NumericT> & obj)
{
  return reinterpret_cast<ReturnT *>(viennacl::traits::handle(obj).cuda_handle().get());
}

/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL vector (through the base class vector_base). Return type needs to be explicitly provided as first template argument. Const version. */
template<typename ReturnT, typename NumericT>
const ReturnT * cuda_arg(vector_base<NumericT> const & obj)
{
  return reinterpret_cast<const ReturnT *>(viennacl::traits::handle(obj).cuda_handle().get());
}
+
+
////// matrix_base

/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL matrix (through the base class matrix_base). Non-const version. */
template<typename NumericT>
NumericT * cuda_arg(matrix_base<NumericT> & obj)
{
  return reinterpret_cast<NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
}

/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL matrix (through the base class matrix_base). Const version. */
template<typename NumericT>
const NumericT * cuda_arg(matrix_base<NumericT> const & obj)
{
  return reinterpret_cast<const NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
}



////// mem_handle


/** @brief Convenience helper function for extracting the CUDA handle from a generic memory handle.
*          The element type cannot be deduced from the handle, so it must be supplied explicitly
*          (call as cuda_arg<SomeT>(handle)). Non-const version. */
template<typename ReturnT>
ReturnT * cuda_arg(viennacl::backend::mem_handle & h)
{
  return reinterpret_cast<ReturnT *>(h.cuda_handle().get());
}

/** @brief Convenience helper function for extracting the CUDA handle from a generic memory handle. Const-version. */
template<typename ReturnT>
ReturnT const * cuda_arg(viennacl::backend::mem_handle const & h)
{
  return reinterpret_cast<const ReturnT *>(h.cuda_handle().get());
}
+
/** \cond **/
// internal: extract the raw pointer directly from a CUDA memory handle
template<typename ReturnT>
ReturnT * cuda_arg(viennacl::backend::mem_handle::cuda_handle_type & h)
{
  return reinterpret_cast<ReturnT *>(h.get());
}

template<typename ReturnT>
ReturnT const *  cuda_arg(viennacl::backend::mem_handle::cuda_handle_type const & h)
{
  return reinterpret_cast<const ReturnT *>(h.get());
}

// non-template overload so plain unsigned int kernel arguments pass through unchanged
inline unsigned int cuda_arg(unsigned int val)  { return val; }

// Identity pass-throughs for CPU scalars: generic code can treat handles and
// plain values uniformly via cuda_arg<NumericT>(x).
// NOTE(review): NumericT does not appear in the parameter list of these
// overloads, so they are presumably always called with an explicit template
// argument -- verify at call sites.
template<typename NumericT> char           cuda_arg(char val)           { return val; }
template<typename NumericT> unsigned char  cuda_arg(unsigned char val)  { return val; }

template<typename NumericT> short          cuda_arg(short val)          { return val; }
template<typename NumericT> unsigned short cuda_arg(unsigned short val) { return val; }

template<typename NumericT> int            cuda_arg(int val)            { return val; }
template<typename NumericT> unsigned int   cuda_arg(unsigned int val)   { return val; }

template<typename NumericT> long           cuda_arg(long val)           { return val; }
template<typename NumericT> unsigned long  cuda_arg(unsigned long val)  { return val; }

template<typename NumericT> float          cuda_arg(float val)          { return val; }
template<typename NumericT> double         cuda_arg(double val)         { return val; }

/** \endcond */
+
+
+namespace linalg
+{
+namespace cuda
+{
+
+
+namespace detail
+{
+
+inline unsigned int make_options(vcl_size_t length, bool reciprocal, bool 
flip_sign)
+{
+  return static_cast<unsigned int>( ((length > 1) ? (static_cast<unsigned 
int>(length) << 2) : 0) + (reciprocal ? 2 : 0) + (flip_sign ? 1 : 0) );
+}
+
+inline void cuda_last_error_check(const char * message, const char * file, 
const int line )
+{
+  cudaError_t error_code = cudaGetLastError();
+
+  if (cudaSuccess != error_code)
+  {
+    std::stringstream ss;
+    ss << file << "(" << line << "): " << ": getLastCudaError() CUDA error " 
<< error_code << ": " << cudaGetErrorString( error_code ) << " @ " << message 
<< std::endl;
+    throw viennacl::backend::cuda::cuda_exception(ss.str(), error_code);
+  }
+}
+
// Maps a scalar type to the matching CUDA built-in two-component vector type
// (float -> float2, double -> double2); primary template is intentionally
// undefined so that unsupported types fail at compile time.
// NOTE(review): presumably used for vectorized / interleaved (e.g. complex)
// accesses -- confirm against the kernels that instantiate it.
template<typename NumericT>
struct type_to_type2;

template<>
struct type_to_type2<float> { typedef float2  type; };

template<>
struct type_to_type2<double> { typedef double2  type; };
+
+
// arg_reference(): dispatch helper for kernel arguments that may be either a
// ViennaCL (GPU) scalar or a plain CPU scalar. For GPU scalars the CUDA memory
// handle is forwarded and the second argument is ignored; for CPU scalars the
// second argument (the value to pass) is forwarded by reference unchanged.
template<typename NumericT, typename OtherT>
typename viennacl::backend::mem_handle::cuda_handle_type & arg_reference(viennacl::scalar<NumericT> & s, OtherT) { return s.handle().cuda_handle(); }

template<typename NumericT, typename OtherT>
typename viennacl::backend::mem_handle::cuda_handle_type const & arg_reference(viennacl::scalar<NumericT> const & s, OtherT) { return s.handle().cuda_handle(); }

// all other cases where T is not a ViennaCL scalar
// NOTE(review): the returned reference aliases the caller's 'val' argument,
// so callers must not bind it to a temporary that dies before use -- verify.
template<typename ArgT>
typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
                              char const &>::type
arg_reference(ArgT, char const & val)  { return val; }

template<typename ArgT>
typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
                              unsigned char const &>::type
arg_reference(ArgT, unsigned char const & val)  { return val; }

template<typename ArgT>
typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
                              short const &>::type
arg_reference(ArgT, short const & val)  { return val; }

template<typename ArgT>
typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
                              unsigned short const &>::type
arg_reference(ArgT, unsigned short const & val)  { return val; }

template<typename ArgT>
typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
                              int const &>::type
arg_reference(ArgT, int const & val)  { return val; }

template<typename ArgT>
typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
                              unsigned int const &>::type
arg_reference(ArgT, unsigned int const & val)  { return val; }

template<typename ArgT>
typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
                              long const &>::type
arg_reference(ArgT, long const & val)  { return val; }

template<typename ArgT>
typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
                              unsigned long const &>::type
arg_reference(ArgT, unsigned long const & val)  { return val; }

template<typename ArgT>
typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
                              float const &>::type
arg_reference(ArgT, float const & val)  { return val; }

template<typename ArgT>
typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
                              double const &>::type
arg_reference(ArgT, double const & val)  { return val; }
+
+} //namespace detail
+} //namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp
new file mode 100644
index 0000000..ae70f9a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp
@@ -0,0 +1,412 @@
+#ifndef VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP
+#define VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+/** @file viennacl/linalg/cuda/direct_solve.hpp
+    @brief Implementations of dense direct solvers using CUDA are found here.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+
+#include "viennacl/linalg/cuda/common.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
/** @brief Kernel for the in-place solution of a dense upper-triangular system A \ B (backward substitution).
*
* Launch layout: gridDim.x = number of columns of B; each thread block solves
* one column, with its threads cooperatively eliminating the already-solved
* row from the rows above it.
*
* @param A / A_start1,2 / A_inc1,2 / A_size1,2 / A_internal_size1,2  submatrix layout of the triangular matrix A
* @param row_major_A    memory layout of A
* @param B / B_start1,2 / B_inc1,2 / B_size1,2 / B_internal_size1,2  submatrix layout of the right-hand side B (overwritten with the solution)
* @param row_major_B    memory layout of B
* @param unit_diagonal  if true, the diagonal of A is taken as implicit ones (no division step)
*/
template<typename NumericT>
__global__ void matrix_matrix_upper_solve_kernel(
          const NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,
          bool row_major_A,

          NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_size1,  unsigned int B_size2,
          unsigned int B_internal_size1, unsigned int B_internal_size2,
          bool row_major_B,

          bool unit_diagonal)
{
  NumericT temp;
  NumericT entry_A;

  // backward substitution: process rows from last to first
  for (unsigned int row_cnt = 0; row_cnt < A_size1; ++row_cnt)
  {
    unsigned int row = A_size1 - 1 - row_cnt;

    if (!unit_diagonal)
    {
      // unit_diagonal is a kernel argument and thus uniform over the block,
      // so this barrier inside the branch is reached by all threads; it
      // orders the previous iteration's eliminations before the division
      __syncthreads();

      // a single thread divides B(row, col) by the diagonal entry A(row, row)
      if (threadIdx.x == 0)
      {
        if (row_major_B)
          B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
                                                                                                              : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
        else //if (!row_major_B)
          B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
                                                                                                              : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
      }
    }

    // make the (possibly scaled) B(row, col) visible to the whole block
    __syncthreads();

    if (row_major_B)
      temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
    else //if (!row_major_B)
      temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];

    // eliminate column of op(A) with index 'row' in parallel from all rows above
    for  (unsigned int elim = threadIdx.x; elim < row; elim += blockDim.x)
    {
      if (row_major_A)
        entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
      else //if (!row_major_A)
        entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];

      if (row_major_B)
        B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
      else //if (!row_major_B)
        B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;

    }
  }
}
+
+
+
/** @brief Kernel for the in-place solution of a dense lower-triangular system A \ B (forward substitution).
*
* Launch layout: gridDim.x = number of columns of B; each thread block solves
* one column, with its threads cooperatively eliminating the already-solved
* row from the rows below it.
*
* @param A / A_start1,2 / A_inc1,2 / A_size1,2 / A_internal_size1,2  submatrix layout of the triangular matrix A
* @param row_major_A    memory layout of A
* @param B / B_start1,2 / B_inc1,2 / B_size1,2 / B_internal_size1,2  submatrix layout of the right-hand side B (overwritten with the solution)
* @param row_major_B    memory layout of B
* @param unit_diagonal  if true, the diagonal of A is taken as implicit ones (no division step)
*/
template<typename NumericT>
__global__ void matrix_matrix_lower_solve_kernel(
          const NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,
          bool row_major_A,

          NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_size1,  unsigned int B_size2,
          unsigned int B_internal_size1, unsigned int B_internal_size2,
          bool row_major_B,

          bool unit_diagonal)
{
  NumericT temp;
  NumericT entry_A;

  // forward substitution: process rows from first to last
  for (unsigned int row = 0; row < A_size1; ++row)
  {

    if (!unit_diagonal)
    {
      // unit_diagonal is a kernel argument and thus uniform over the block,
      // so this barrier inside the branch is reached by all threads; it
      // orders the previous iteration's eliminations before the division
      __syncthreads();

      // a single thread divides B(row, col) by the diagonal entry A(row, row)
      if (threadIdx.x == 0)
      {
        if (row_major_B)
          B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
                                                                                                              : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
        else //if (!row_major_B)
          B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
                                                                                                              : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
      }
    }

    // make the (possibly scaled) B(row, col) visible to the whole block
    __syncthreads();

    if (row_major_B)
      temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
    else //if (!row_major_B)
      temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];

    // eliminate column of op(A) with index 'row' in parallel from all rows below
    for  (unsigned int elim = row + threadIdx.x + 1; elim < A_size1; elim += blockDim.x)
    {
      if (row_major_A)
        entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
      else //if (!row_major_A)
        entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];

      if (row_major_B)
        B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
      else //if (!row_major_B)
        B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;

    }
  }
}
+
+
+
+
+
+
namespace detail
{
  /** @brief Returns true if the solver tag requests an implicit unit diagonal (diagonal entries of A are not read). */
  template<typename TagT>
  bool is_unit_solve(TagT const & tag) { return false; }

  inline bool is_unit_solve(viennacl::linalg::unit_lower_tag) { return true; }
  inline bool is_unit_solve(viennacl::linalg::unit_upper_tag) { return true; }

  /** @brief Returns true if the solver tag denotes an upper-triangular system. */
  template<typename TagT>
  bool is_upper_solve(TagT const & tag) { return false; }

  inline bool is_upper_solve(viennacl::linalg::upper_tag) { return true; }
  inline bool is_upper_solve(viennacl::linalg::unit_upper_tag) { return true; }

  /** @brief Dispatches the inplace triangular solve A \ B (MATLAB notation) to the upper- or lower-triangular CUDA kernel.
  *
  * Launch configuration: one block of 128 threads per column of B (grid = B.size2()).
  *
  * @param A    Triangular system matrix (device memory)
  * @param B    Matrix of right-hand sides; overwritten with the solution
  * @param tag  Solver tag selecting upper/lower and unit/non-unit diagonal
  */
  template<typename Matrix1T, typename Matrix2T, typename SolverTagT>
  void inplace_solve_impl(Matrix1T const & A,
                          Matrix2T & B,
                          SolverTagT const & tag)
  {
    dim3 threads(128);
    dim3 grid(B.size2());   // one CUDA block per column of B

    if (is_upper_solve(tag))
    {
      matrix_matrix_upper_solve_kernel<<<grid,threads>>>(viennacl::cuda_arg(A),
                                                         static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
                                                         static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
                                                         static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
                                                         static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
                                                         bool(A.row_major()),

                                                         viennacl::cuda_arg(B),
                                                         static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
                                                         static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
                                                         static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
                                                         static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
                                                         bool(B.row_major()),

                                                         is_unit_solve(tag)
                                                        );
      VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_matrix_upper_solve_kernel");  // launch errors are silent without this check
    }
    else
    {
      matrix_matrix_lower_solve_kernel<<<grid,threads>>>(viennacl::cuda_arg(A),
                                                         static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
                                                         static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
                                                         static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
                                                         static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
                                                         bool(A.row_major()),

                                                         viennacl::cuda_arg(B),
                                                         static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
                                                         static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
                                                         static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
                                                         static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
                                                         bool(B.row_major()),

                                                         is_unit_solve(tag)
                                                        );
      VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_matrix_lower_solve_kernel");
    }

  }
}
+
+
+//
+// Note: By convention, all size checks are performed in the calling frontend. 
No need to double-check here.
+//
+
+////////////////// triangular solver //////////////////////////////////////
/** @brief Inplace triangular solve with multiple right-hand sides: computes A \ B (MATLAB notation) directly in B.
*
* @param A    The triangular system matrix
* @param B    The right-hand-side matrix; holds the solution on return
* @param tag  Solver tag identifying the triangular structure of A
*/
template<typename NumericT, typename SolverTagT>
void inplace_solve(matrix_base<NumericT> const & A,
                   matrix_base<NumericT> & B,
                   SolverTagT tag)
{
  // Forward to the tag-dispatching implementation in detail::
  detail::inplace_solve_impl(A, B, tag);
}
+
+
+//
+//  Solve on vector
+//
+
/** @brief In-place triangular substitution v <- A \ v for a ROW-major matrix A.
*
* Intended for a single-block launch (see inplace_solve_vector_impl, which uses
* <<<1, 128>>>): the sequential dependency between rows is enforced only via
* __syncthreads(), which synchronizes within one block.
*
* Option bits (see detail::get_option_for_solver_tag):
*   bit 0 - unit diagonal (diagonal entries of A are not read),
*   bit 2 - lower-triangular (forward) solve; otherwise upper (backward).
*
* @param A       Row-major system matrix (device); required to be square
* @param v       Right-hand-side vector; overwritten with the solution
* @param v_size  Vector length (currently unused inside the kernel)
* @param options Bitmask as described above
*/
template<typename NumericT>
__global__ void triangular_substitute_inplace_row_kernel(
          NumericT const * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,
          NumericT * v,
          unsigned int v_start,
          unsigned int v_inc,
          unsigned int v_size,

          unsigned int options)
{
  NumericT temp;
  unsigned int unit_diagonal_flag  = (options & (1 << 0));  // bit 0: diagonal implicitly 1

  unsigned int is_lower_solve      = (options & (1 << 2));  // bit 2: forward (lower) vs. backward (upper)
  unsigned int row;
  for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)    //Note: A required to be square
  {
    // Forward substitution walks rows top-down, backward substitution bottom-up.
    row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
    if (!unit_diagonal_flag)
    {
      // Barrier is uniform across the block: unit_diagonal_flag is a kernel
      // argument, so all threads take the same branch.
      __syncthreads();
      // Single thread divides by the diagonal entry A(row,row).
      if (threadIdx.x == 0)
        v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
    }

    // Make the (possibly updated) v[row] visible to all threads before reading it.
    __syncthreads();

    temp = v[row * v_inc + v_start];

    // Eliminate the remaining entries of column 'row' in parallel:
    // lower solve updates rows below 'row', upper solve rows above it.
    for (int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
            elim < (is_lower_solve ? A_size1 : row);
            elim += blockDim.x)
      v[elim * v_inc + v_start] -= temp * A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row  * A_inc2 + A_start2)];
  }
}
+
+
/** @brief In-place triangular substitution v <- A \ v for a COLUMN-major matrix A.
*
* Identical algorithm to triangular_substitute_inplace_row_kernel; only the
* indexing of A differs (column-major: column offset scaled by A_internal_size1).
* Intended for a single-block launch (see inplace_solve_vector_impl), since the
* row-to-row dependency is enforced solely via __syncthreads().
*
* Option bits (see detail::get_option_for_solver_tag):
*   bit 0 - unit diagonal (diagonal entries of A are not read),
*   bit 2 - lower-triangular (forward) solve; otherwise upper (backward).
*
* @param A       Column-major system matrix (device); required to be square
* @param v       Right-hand-side vector; overwritten with the solution
* @param v_size  Vector length (currently unused inside the kernel)
* @param options Bitmask as described above
*/
template<typename NumericT>
__global__ void triangular_substitute_inplace_col_kernel(
          NumericT const * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,
          NumericT * v,
          unsigned int v_start,
          unsigned int v_inc,
          unsigned int v_size,
          unsigned int options)
{
  NumericT temp;
  unsigned int unit_diagonal_flag  = (options & (1 << 0));  // bit 0: diagonal implicitly 1

  unsigned int is_lower_solve      = (options & (1 << 2));  // bit 2: forward (lower) vs. backward (upper)
  unsigned int row;
  for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)    //Note: A required to be square
  {
    // Forward substitution walks rows top-down, backward substitution bottom-up.
    row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
    if (!unit_diagonal_flag)
    {
      // Barrier is uniform across the block: unit_diagonal_flag is a kernel
      // argument, so all threads take the same branch.
      __syncthreads();
      // Single thread divides by the diagonal entry A(row,row).
      if (threadIdx.x == 0)
        v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
    }

    // Make the (possibly updated) v[row] visible to all threads before reading it.
    __syncthreads();

    temp = v[row * v_inc + v_start];

    // Eliminate the remaining entries of column 'row' in parallel:
    // lower solve updates rows below 'row', upper solve rows above it.
    for (int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
            elim < (is_lower_solve ? A_size1 : row);
            elim += blockDim.x)
      v[elim * v_inc + v_start] -= temp * A[(elim * A_inc1 + A_start1) + (row  * A_inc2 + A_start2) * A_internal_size1];
  }
}
+
+
namespace detail
{
  // Map a solver tag to the option bitmask consumed by the substitution kernels:
  // bit 0 -> unit diagonal, bit 2 -> lower-triangular (forward) solve.
  inline unsigned int get_option_for_solver_tag(viennacl::linalg::upper_tag)      { return 0; }
  inline unsigned int get_option_for_solver_tag(viennacl::linalg::unit_upper_tag) { return (1 << 0); }
  inline unsigned int get_option_for_solver_tag(viennacl::linalg::lower_tag)      { return (1 << 2); }
  inline unsigned int get_option_for_solver_tag(viennacl::linalg::unit_lower_tag) { return (1 << 2) | (1 << 0); }

  /** @brief Launches the in-place triangular substitution kernel computing vec <- mat \ vec.
  *
  * A single block of 128 threads is used deliberately: the substitution kernels
  * enforce the sequential row dependency via __syncthreads(), which only
  * synchronizes within one block.
  *
  * @param mat      Triangular system matrix (device memory)
  * @param vec      Right-hand-side vector; overwritten with the solution
  * @param options  Bitmask obtained from get_option_for_solver_tag()
  */
  template<typename MatrixT, typename VectorT>
  void inplace_solve_vector_impl(MatrixT const & mat,
                                 VectorT & vec,
                                 unsigned int options)
  {
    if (mat.row_major())
    {
      triangular_substitute_inplace_row_kernel<<<1, 128>>>(viennacl::cuda_arg(mat),
                                                           static_cast<unsigned int>(viennacl::traits::start1(mat)),         static_cast<unsigned int>(viennacl::traits::start2(mat)),
                                                           static_cast<unsigned int>(viennacl::traits::stride1(mat)),        static_cast<unsigned int>(viennacl::traits::stride2(mat)),
                                                           static_cast<unsigned int>(viennacl::traits::size1(mat)),          static_cast<unsigned int>(viennacl::traits::size2(mat)),
                                                           static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
                                                           viennacl::cuda_arg(vec),
                                                           static_cast<unsigned int>(viennacl::traits::start(vec)),
                                                           static_cast<unsigned int>(viennacl::traits::stride(vec)),
                                                           static_cast<unsigned int>(viennacl::traits::size(vec)),
                                                           options
                                                          );
      VIENNACL_CUDA_LAST_ERROR_CHECK("triangular_substitute_inplace_row_kernel");  // launch errors are silent without this check
    }
    else
    {
      triangular_substitute_inplace_col_kernel<<<1, 128>>>(viennacl::cuda_arg(mat),
                                                           static_cast<unsigned int>(viennacl::traits::start1(mat)),         static_cast<unsigned int>(viennacl::traits::start2(mat)),
                                                           static_cast<unsigned int>(viennacl::traits::stride1(mat)),        static_cast<unsigned int>(viennacl::traits::stride2(mat)),
                                                           static_cast<unsigned int>(viennacl::traits::size1(mat)),          static_cast<unsigned int>(viennacl::traits::size2(mat)),
                                                           static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
                                                           viennacl::cuda_arg(vec),
                                                           static_cast<unsigned int>(viennacl::traits::start(vec)),
                                                           static_cast<unsigned int>(viennacl::traits::stride(vec)),
                                                           static_cast<unsigned int>(viennacl::traits::size(vec)),
                                                           options
                                                          );
      VIENNACL_CUDA_LAST_ERROR_CHECK("triangular_substitute_inplace_col_kernel");
    }
  }

}
+
/** @brief Inplace triangular solve on a vector (non-transposed): computes vec <- mat \ vec.
*
* @param mat  The triangular system matrix
* @param vec  The load vector; holds the solution on return
*/
template<typename NumericT, typename SolverTagT>
void inplace_solve(matrix_base<NumericT> const & mat,
                   vector_base<NumericT> & vec,
                   SolverTagT)
{
  // Translate the (stateless) solver tag into the kernel option bitmask and dispatch.
  detail::inplace_solve_vector_impl(mat, vec, detail::get_option_for_solver_tag(SolverTagT()));
}
+
+
+}
+}
+}
+
+#endif

Reply via email to