http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp
 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp
deleted file mode 100755
index 960f5c2..0000000
--- 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp
+++ /dev/null
@@ -1,180 +0,0 @@
-#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_ONEI_HPP_
-#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_ONEI_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-
-/** @file viennacl/linalg/cuda/bisect_kernel_large_onei.hpp
-    @brief Determine eigenvalues for large matrices for intervals that 
contained after the first step one eigenvalue
-
-    Implementation based on the sample provided with the CUDA 6.0 SDK, for 
which
-    the creation of derivative works is allowed by including the following 
statement:
-    "This software contains source code provided by NVIDIA Corporation."
-*/
-
-// includes, project
-#include "viennacl/linalg/detail/bisect/config.hpp"
-#include "viennacl/linalg/detail/bisect/util.hpp"
-// additional kernel
-#include "viennacl/linalg/cuda/bisect_util.hpp"
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace cuda
-{
-////////////////////////////////////////////////////////////////////////////////
-//! Determine eigenvalues for large matrices for intervals that after
-//! the first step contained one eigenvalue
-//! @param  g_d  diagonal elements of symmetric, tridiagonal matrix
-//! @param  g_s  superdiagonal elements of symmetric, tridiagonal matrix
-//! @param  n    matrix size
-//! @param  num_intervals  total number of intervals containing one eigenvalue
-//!                         after the first step
-//! @param g_left  left interval limits
-//! @param g_right  right interval limits
-//! @param g_pos  index of interval / number of intervals that are smaller than
-//!               right interval limit
-//! @param  precision  desired precision of eigenvalues
-////////////////////////////////////////////////////////////////////////////////
-template<typename NumericT>
-__global__
-void
-bisectKernelLarge_OneIntervals(const NumericT *g_d, const NumericT *g_s, const 
unsigned int n,
-                               unsigned int num_intervals,
-                               NumericT *g_left, NumericT *g_right,
-                               unsigned int *g_pos,
-                               NumericT  precision)
-{
-
-  const unsigned int gtid = (blockDim.x * blockIdx.x) + threadIdx.x;
-
-  __shared__  NumericT  s_left_scratch[VIENNACL_BISECT_MAX_THREADS_BLOCK];
-  __shared__  NumericT  s_right_scratch[VIENNACL_BISECT_MAX_THREADS_BLOCK];
-
-  // active interval of thread
-  // left and right limit of current interval
-  NumericT left, right;
-  // number of threads smaller than the right limit (also corresponds to the
-  // global index of the eigenvalues contained in the active interval)
-  unsigned int right_count;
-  // flag if current thread converged
-  unsigned int converged = 0;
-  // midpoint when current interval is subdivided
-  NumericT mid = 0.0f;
-  // number of eigenvalues less than mid
-  unsigned int mid_count = 0;
-
-  // read data from global memory
-  if (gtid < num_intervals)
-  {
-    left = g_left[gtid];
-    right = g_right[gtid];
-    right_count = g_pos[gtid];
-  }
-
-
-  // flag to determine if all threads converged to eigenvalue
-  __shared__  unsigned int  converged_all_threads;
-
-  // initialized shared flag
-  if (0 == threadIdx.x)
-  {
-    converged_all_threads = 0;
-  }
-
-  __syncthreads();
-
-  // process until all threads converged to an eigenvalue
-  while (true)
-  {
-
-    converged_all_threads = 1;
-
-    // update midpoint for all active threads
-    if ((gtid < num_intervals) && (0 == converged))
-    {
-      mid = computeMidpoint(left, right);
-    }
-
-    // find number of eigenvalues that are smaller than midpoint
-    mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
-                                                mid, gtid, num_intervals,
-                                                s_left_scratch,
-                                                s_right_scratch,
-                                                converged);
-
-    __syncthreads();
-
-    // for all active threads
-    if ((gtid < num_intervals) && (0 == converged))
-    {
-
-      // update intervals -- always one child interval survives
-      if (right_count == mid_count)
-      {
-        right = mid;
-      }
-      else
-      {
-        left = mid;
-      }
-
-      // check for convergence
-      NumericT t0 = right - left;
-      NumericT t1 = max(abs(right), abs(left)) * precision;
-
-      if (t0 < min(precision, t1))
-      {
-        NumericT lambda = computeMidpoint(left, right);
-        left = lambda;
-        right = lambda;
-
-        converged = 1;
-      }
-      else
-      {
-        converged_all_threads = 0;
-      }
-    }
-
-    __syncthreads();
-
-    if (1 == converged_all_threads)
-    {
-      break;
-    }
-
-    __syncthreads();
-  }
-
-  // write data back to global memory
-  __syncthreads();
-
-  if (gtid < num_intervals)
-  {
-      // intervals converged so left and right interval limit are both 
identical
-      // and identical to the eigenvalue
-      g_left[gtid] = left;
-  }
-}
-} // namespace cuda
-} // namespace linalg
-} // namespace viennacl
-#endif // #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_ONEI_HPP_

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp
deleted file mode 100755
index 310b381..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp
+++ /dev/null
@@ -1,261 +0,0 @@
-#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_SMALL_HPP_
-#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_SMALL_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-
-/** @file viennacl/linalg/cuda/bisect_kernel_small.hpp
-    @brief Determine eigenvalues for small symmetric, tridiagonal matrix
-
-    Implementation based on the sample provided with the CUDA 6.0 SDK, for 
which
-    the creation of derivative works is allowed by including the following 
statement:
-    "This software contains source code provided by NVIDIA Corporation."
-*/
-
-// includes, project
-#include "viennacl/linalg/detail/bisect/config.hpp"
-#include "viennacl/linalg/detail/bisect/util.hpp"
-// additional kernel
-#include "viennacl/linalg/cuda/bisect_util.hpp"
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace cuda
-{
-
-/** @brief Bisection to find eigenvalues of a real, symmetric, and tridiagonal 
matrix
-*
-* @param  g_d  diagonal elements in global memory
-* @param  g_s  superdiagonal elements in global elements (stored so that the 
element *(g_s - 1) can be accessed an equals 0
-* @param  n    size of matrix
-* @param  g_left         helper array
-* @param  g_right        helper array
-* @param  g_left_count   helper array
-* @param  g_right_count  helper array
-* @param  lg             lower bound of input interval (e.g. Gerschgorin 
interval)
-* @param  ug             upper bound of input interval (e.g. Gerschgorin 
interval)
-* @param  lg_eig_count   number of eigenvalues that are smaller than lg
-* @param  ug_eig_count   number of eigenvalues that are smaller than lu
-* @param  epsilon        desired accuracy of eigenvalues to compute
-*/
-template<typename NumericT>
-__global__
-void
-bisectKernelSmall(const NumericT *g_d, const NumericT *g_s, const unsigned int 
n,
-             NumericT * g_left, NumericT *g_right,
-             unsigned int *g_left_count, unsigned int *g_right_count,
-             const NumericT lg, const NumericT ug,
-             const unsigned int lg_eig_count, const unsigned int ug_eig_count,
-             NumericT epsilon
-            )
-{
-    // intervals (store left and right because the subdivision tree is in 
general
-    // not dense
-    __shared__  NumericT  
s_left[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];
-    __shared__  NumericT  
s_right[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];
-
-    // number of eigenvalues that are smaller than s_left / s_right
-    // (correspondence is realized via indices)
-    __shared__  unsigned int  
s_left_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];
-    __shared__  unsigned int  
s_right_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];
-
-    // helper for stream compaction
-    __shared__  unsigned int
-    s_compaction_list[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX + 1];
-
-    // state variables for whole block
-    // if 0 then compaction of second chunk of child intervals is not necessary
-    // (because all intervals had exactly one non-dead child)
-    __shared__  unsigned int compact_second_chunk;
-    __shared__  unsigned int all_threads_converged;
-
-    // number of currently active threads
-    __shared__  unsigned int num_threads_active;
-
-    // number of threads to use for stream compaction
-    __shared__  unsigned int num_threads_compaction;
-
-    // helper for exclusive scan
-    unsigned int *s_compaction_list_exc = s_compaction_list + 1;
-
-
-    // variables for currently processed interval
-    // left and right limit of active interval
-    NumericT  left = 0.0f;
-    NumericT  right = 0.0f;
-    unsigned int left_count = 0;
-    unsigned int right_count = 0;
-    // midpoint of active interval
-    NumericT  mid = 0.0f;
-    // number of eigenvalues smaller then mid
-    unsigned int mid_count = 0;
-    // affected from compaction
-    unsigned int  is_active_second = 0;
-
-    s_compaction_list[threadIdx.x] = 0;
-    s_left[threadIdx.x] = 0;
-    s_right[threadIdx.x] = 0;
-    s_left_count[threadIdx.x] = 0;
-    s_right_count[threadIdx.x] = 0;
-
-    __syncthreads();
-
-    // set up initial configuration
-    if (0 == threadIdx.x)
-    {
-        s_left[0] = lg;
-        s_right[0] = ug;
-        s_left_count[0] = lg_eig_count;
-        s_right_count[0] = ug_eig_count;
-
-        compact_second_chunk = 0;
-        num_threads_active = 1;
-
-        num_threads_compaction = 1;
-    }
-
-    // for all active threads read intervals from the last level
-    // the number of (worst case) active threads per level l is 2^l
-    while (true)
-    {
-
-        all_threads_converged = 1;
-        __syncthreads();
-
-        is_active_second = 0;
-        subdivideActiveIntervalMulti(threadIdx.x,
-                                s_left, s_right, s_left_count, s_right_count,
-                                num_threads_active,
-                                left, right, left_count, right_count,
-                                mid, all_threads_converged);
-
-        __syncthreads();
-
-        // check if done
-        if (1 == all_threads_converged)
-        {
-            break;
-        }
-
-        __syncthreads();
-
-        // compute number of eigenvalues smaller than mid
-        // use all threads for reading the necessary matrix data from global
-        // memory
-        // use s_left and s_right as scratch space for diagonal and
-        // superdiagonal of matrix
-        mid_count = computeNumSmallerEigenvals(g_d, g_s, n, mid,
-                                               threadIdx.x, num_threads_active,
-                                               s_left, s_right,
-                                               (left == right));
-
-        __syncthreads();
-
-        // store intervals
-        // for all threads store the first child interval in a continuous 
chunk of
-        // memory, and the second child interval -- if it exists -- in a second
-        // chunk; it is likely that all threads reach convergence up to
-        // \a epsilon at the same level; furthermore, for higher level most / 
all
-        // threads will have only one child, storing the first child compactly 
will
-        // (first) avoid to perform a compaction step on the first chunk, 
(second)
-        // make it for higher levels (when all threads / intervals have
-        // exactly one child)  unnecessary to perform a compaction of the 
second
-        // chunk
-        if (threadIdx.x < num_threads_active)
-        {
-
-            if (left != right)
-            {
-
-                // store intervals
-                storeNonEmptyIntervals(threadIdx.x, num_threads_active,
-                                       s_left, s_right, s_left_count, 
s_right_count,
-                                       left, mid, right,
-                                       left_count, mid_count, right_count,
-                                       epsilon, compact_second_chunk,
-                                       s_compaction_list_exc,
-                                       is_active_second);
-            }
-            else
-            {
-
-                storeIntervalConverged(s_left, s_right, s_left_count, 
s_right_count,
-                                       left, mid, right,
-                                       left_count, mid_count, right_count,
-                                       s_compaction_list_exc, 
compact_second_chunk,
-                                       num_threads_active,
-                                       is_active_second);
-            }
-        }
-
-        // necessary so that compact_second_chunk is up-to-date
-        __syncthreads();
-
-        // perform compaction of chunk where second children are stored
-        // scan of (num_threads_active / 2) elements, thus at most
-        // (num_threads_active / 4) threads are needed
-        if (compact_second_chunk > 0)
-        {
-
-            createIndicesCompaction(s_compaction_list_exc, 
num_threads_compaction);
-
-            compactIntervals(s_left, s_right, s_left_count, s_right_count,
-                             mid, right, mid_count, right_count,
-                             s_compaction_list, num_threads_active,
-                             is_active_second);
-        }
-
-        __syncthreads();
-
-        if (0 == threadIdx.x)
-        {
-
-            // update number of active threads with result of reduction
-            num_threads_active += s_compaction_list[num_threads_active];
-
-            num_threads_compaction = ceilPow2(num_threads_active);
-
-            compact_second_chunk = 0;
-        }
-
-        __syncthreads();
-
-    }
-
-    __syncthreads();
-
-    // write resulting intervals to global mem
-    // for all threads write if they have been converged to an eigenvalue to
-    // a separate array
-
-    // at most n valid intervals
-    if (threadIdx.x < n)
-    {
-
-        // intervals converged so left and right limit are identical
-        g_left[threadIdx.x]  = s_left[threadIdx.x];
-        // left count is sufficient to have global order
-        g_left_count[threadIdx.x]  = s_left_count[threadIdx.x];
-    }
-}
-} // namespace cuda
-} // namespace linalg
-} // namespace viennacl
-#endif // #ifndef _BISECT_KERNEL_SMALL_H_

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp
deleted file mode 100755
index e2e262c..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp
+++ /dev/null
@@ -1,613 +0,0 @@
-#ifndef VIENNACL_LINALG_CUDA_BISECT_BISECT_UTIL_HPP_
-#define VIENNACL_LINALG_CUDA_BISECT_BISECT_UTIL_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-
-/** @file viennacl/linalg/cuda/bisect_kernel_small.hpp
-    @brief Utility / shared functionality for bisection kernels
-
-    Implementation based on the sample provided with the CUDA 6.0 SDK, for 
which
-    the creation of derivative works is allowed by including the following 
statement:
-    "This software contains source code provided by NVIDIA Corporation."
-*/
-
-// includes, project
-#include "viennacl/linalg/detail/bisect/config.hpp"
-#include "viennacl/linalg/detail/bisect/util.hpp"
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace cuda
-{
-////////////////////////////////////////////////////////////////////////////////
-//! Compute the next lower power of two of n
-//! @param  n  number for which next higher power of two is seeked
-////////////////////////////////////////////////////////////////////////////////
-__device__
-inline int
-floorPow2(int n)
-{
-
-    // early out if already power of two
-    if (0 == (n & (n-1)))
-    {
-        return n;
-    }
-
-    int exp;
-    frexp((float)n, &exp);
-    return (1 << (exp - 1));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Compute the next higher power of two of n
-//! @param  n  number for which next higher power of two is seeked
-////////////////////////////////////////////////////////////////////////////////
-__device__
-inline int
-ceilPow2(int n)
-{
-
-    // early out if already power of two
-    if (0 == (n & (n-1)))
-    {
-        return n;
-    }
-
-    int exp;
-    frexp((float)n, &exp);
-    return (1 << exp);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Compute midpoint of interval [\a left, \a right] avoiding overflow if
-//! possible
-//! @param left   left / lower limit of interval
-//! @param right  right / upper limit of interval
-////////////////////////////////////////////////////////////////////////////////
-template<typename NumericT>
-__device__
-inline NumericT
-computeMidpoint(const NumericT left, const NumericT right)
-{
-
-    NumericT mid;
-
-    if (viennacl::linalg::detail::sign_f(left) == 
viennacl::linalg::detail::sign_f(right))
-    {
-        mid = left + (right - left) * 0.5f;
-    }
-    else
-    {
-        mid = (left + right) * 0.5f;
-    }
-
-    return mid;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Check if interval converged and store appropriately
-//! @param  addr    address where to store the information of the interval
-//! @param  s_left  shared memory storage for left interval limits
-//! @param  s_right  shared memory storage for right interval limits
-//! @param  s_left_count  shared memory storage for number of eigenvalues less
-//!                       than left interval limits
-//! @param  s_right_count  shared memory storage for number of eigenvalues less
-//!                       than right interval limits
-//! @param  left   lower limit of interval
-//! @param  right  upper limit of interval
-//! @param  left_count  eigenvalues less than \a left
-//! @param  right_count  eigenvalues less than \a right
-//! @param  precision  desired precision for eigenvalues
-////////////////////////////////////////////////////////////////////////////////
-template<class S, class T, class NumericT>
-__device__
-void
-storeInterval(unsigned int addr,
-              NumericT *s_left, NumericT *s_right,
-              T *s_left_count, T *s_right_count,
-              NumericT left, NumericT right,
-              S left_count, S right_count,
-              NumericT precision)
-{
-    s_left_count[addr] = left_count;
-    s_right_count[addr] = right_count;
-
-    // check if interval converged
-    NumericT t0 = abs(right - left);
-    NumericT t1 = max(abs(left), abs(right)) * precision;
-
-    if (t0 <= max(static_cast<NumericT>(VIENNACL_BISECT_MIN_ABS_INTERVAL), t1))
-    {
-        // compute mid point
-        NumericT lambda = computeMidpoint(left, right);
-
-        // mark as converged
-        s_left[addr] = lambda;
-        s_right[addr] = lambda;
-    }
-    else
-    {
-
-        // store current limits
-        s_left[addr] = left;
-        s_right[addr] = right;
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Compute number of eigenvalues that are smaller than x given a symmetric,
-//! real, and tridiagonal matrix
-//! @param  g_d  diagonal elements stored in global memory
-//! @param  g_s  superdiagonal elements stored in global memory
-//! @param  n    size of matrix
-//! @param  x    value for which the number of eigenvalues that are smaller is
-//!              seeked
-//! @param  tid  thread identified (e.g. threadIdx.x or gtid)
-//! @param  num_intervals_active  number of active intervals / threads that
-//!                               currently process an interval
-//! @param  s_d  scratch space to store diagonal entries of the tridiagonal
-//!              matrix in shared memory
-//! @param  s_s  scratch space to store superdiagonal entries of the 
tridiagonal
-//!              matrix in shared memory
-//! @param  converged  flag if the current thread is already converged (that
-//!         is count does not have to be computed)
-////////////////////////////////////////////////////////////////////////////////
-template<typename NumericT>
-__device__
-inline unsigned int
-computeNumSmallerEigenvals(const NumericT *g_d, const NumericT *g_s, const 
unsigned int n,
-                           const NumericT x,
-                           const unsigned int tid,
-                           const unsigned int num_intervals_active,
-                           NumericT *s_d, NumericT *s_s,
-                           unsigned int converged
-                          )
-{
-
-    NumericT  delta = 1.0f;
-    unsigned int count = 0;
-
-    __syncthreads();
-
-    // read data into shared memory
-    if (threadIdx.x < n)
-    {
-        s_d[threadIdx.x] = *(g_d + threadIdx.x);
-        s_s[threadIdx.x] = *(g_s + threadIdx.x - 1);
-    }
-
-    __syncthreads();
-
-    // perform loop only for active threads
-    if ((tid < num_intervals_active) && (0 == converged))
-    {
-
-        // perform (optimized) Gaussian elimination to determine the number
-        // of eigenvalues that are smaller than n
-        for (unsigned int k = 0; k < n; ++k)
-        {
-            delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta;
-            count += (delta < 0) ? 1 : 0;
-        }
-
-    }  // end if thread currently processing an interval
-
-    return count;
-}
-////////////////////////////////////////////////////////////////////////////////
-//! Compute number of eigenvalues that are smaller than x given a symmetric,
-//! real, and tridiagonal matrix
-//! @param  g_d  diagonal elements stored in global memory
-//! @param  g_s  superdiagonal elements stored in global memory
-//! @param  n    size of matrix
-//! @param  x    value for which the number of eigenvalues that are smaller is
-//!              seeked
-//! @param  tid  thread identified (e.g. threadIdx.x or gtid)
-//! @param  num_intervals_active  number of active intervals / threads that
-//!                               currently process an interval
-//! @param  s_d  scratch space to store diagonal entries of the tridiagonal
-//!              matrix in shared memory
-//! @param  s_s  scratch space to store superdiagonal entries of the 
tridiagonal
-//!              matrix in shared memory
-//! @param  converged  flag if the current thread is already converged (that
-//!         is count does not have to be computed)
-////////////////////////////////////////////////////////////////////////////////
-template<typename NumericT>
-__device__
-inline unsigned int
-computeNumSmallerEigenvalsLarge(const NumericT *g_d, const NumericT *g_s, 
const unsigned int n,
-                                const NumericT x,
-                                const unsigned int tid,
-                                const unsigned int num_intervals_active,
-                                NumericT *s_d, NumericT *s_s,
-                                unsigned int converged
-                               )
-{
-    NumericT  delta = 1.0f;
-    unsigned int count = 0;
-
-    unsigned int rem = n;
-
-    // do until whole diagonal and superdiagonal has been loaded and processed
-    for (unsigned int i = 0; i < n; i += blockDim.x)
-    {
-
-        __syncthreads();
-
-        // read new chunk of data into shared memory
-        if ((i + threadIdx.x) < n)
-        {
-
-            s_d[threadIdx.x] = *(g_d + i + threadIdx.x);
-            s_s[threadIdx.x] = *(g_s + i + threadIdx.x - 1);
-        }
-
-        __syncthreads();
-
-
-        if (tid < num_intervals_active)
-        {
-
-            // perform (optimized) Gaussian elimination to determine the number
-            // of eigenvalues that are smaller than n
-            for (unsigned int k = 0; k < min(rem,blockDim.x); ++k)
-            {
-                delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta;
-                // delta = (abs( delta) < (1.0e-10)) ? -(1.0e-10) : delta;
-                count += (delta < 0) ? 1 : 0;
-            }
-
-        }  // end if thread currently processing an interval
-
-        rem -= blockDim.x;
-    }
-
-    return count;
-}
-
-/** @brief Store all non-empty intervals resulting from the subdivision of the 
interval currently processed by the thread.
-*
-* @param  addr                   base address for storing intervals
-* @param  num_threads_active     number of threads / intervals in current sweep
-* @param  s_left                 shared memory storage for left interval limits
-* @param  s_right                shared memory storage for right interval 
limits
-* @param  s_left_count           shared memory storage for number of 
eigenvalues less than left interval limits
-* @param  s_right_count          shared memory storage for number of 
eigenvalues less than right interval limits
-* @param  left                   lower limit of interval
-* @param  mid                    midpoint of interval
-* @param  right                  upper limit of interval
-* @param  left_count             eigenvalues less than \a left
-* @param  mid_count              eigenvalues less than \a mid
-* @param  right_count            eigenvalues less than \a right
-* @param  precision              desired precision for eigenvalues
-* @param  compact_second_chunk   shared mem flag if second chunk is used and 
ergo requires compaction
-* @param  s_compaction_list_exc  helper array for stream compaction, 
s_compaction_list_exc[tid] = 1 when the thread generated two child intervals
-* @param  is_active_second       mark is thread has a second non-empty child 
interval
-*/
-template<class S, class T, class NumericT>
-__device__
-void
-storeNonEmptyIntervals(unsigned int addr,
-                       const unsigned int num_threads_active,
-                       NumericT  *s_left, NumericT *s_right,
-                       T  *s_left_count, T *s_right_count,
-                       NumericT left, NumericT mid, NumericT right,
-                       const S left_count,
-                       const S mid_count,
-                       const S right_count,
-                       NumericT precision,
-                       unsigned int &compact_second_chunk,
-                       T *s_compaction_list_exc,
-                       unsigned int &is_active_second)
-{
-    // check if both child intervals are valid
-
-    if ((left_count != mid_count) && (mid_count != right_count))
-    {
-
-        // store the left interval
-        storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
-                      left, mid, left_count, mid_count, precision);
-
-        // mark that a second interval has been generated, only stored after
-        // stream compaction of second chunk
-        is_active_second = 1;
-        s_compaction_list_exc[threadIdx.x] = 1;
-        compact_second_chunk = 1;
-    }
-    else
-    {
-
-        // only one non-empty child interval
-
-        // mark that no second child
-        is_active_second = 0;
-        s_compaction_list_exc[threadIdx.x] = 0;
-
-        // store the one valid child interval
-        if (left_count != mid_count)
-        {
-            storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
-                          left, mid, left_count, mid_count, precision);
-        }
-        else
-        {
-            storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
-                          mid, right, mid_count, right_count, precision);
-        }
-
-    }
-}
-////////////////////////////////////////////////////////////////////////////////
-//! Create indices for compaction, that is process \a s_compaction_list_exc
-//! which is 1 for intervals that generated a second child and 0 otherwise
-//! and create for each of the non-zero elements the index where the new
-//! interval belongs to in a compact representation of all generated second
-//! childs
-//! @param   s_compaction_list_exc  list containing the flags which threads
-//!                                 generated two childs
-//! @param   num_threads_compaction number of threads to employ for compaction
-////////////////////////////////////////////////////////////////////////////////
-template<class T>
-__device__
-void
-createIndicesCompaction(T *s_compaction_list_exc,
-                        unsigned int num_threads_compaction)
-{
-
-    unsigned int offset = 1;
-    const unsigned int tid = threadIdx.x;
-   // if(tid == 0)
-     // printf("num_threads_compaction = %u\n", num_threads_compaction);
-
-    // higher levels of scan tree
-    for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
-    {
-
-        __syncthreads();
-
-        if (tid < d)
-        {
-
-            unsigned int  ai = offset*(2*tid+1)-1;
-            unsigned int  bi = offset*(2*tid+2)-1;
-
-            s_compaction_list_exc[bi] =   s_compaction_list_exc[bi]
-                                          + s_compaction_list_exc[ai];
-        }
-
-        offset <<= 1;
-    }
-
-    // traverse down tree: first down to level 2 across
-    for (int d = 2; d < num_threads_compaction; d <<= 1)
-    {
-
-        offset >>= 1;
-        __syncthreads();
-
-        if (tid < (d-1))
-        {
-
-            unsigned int  ai = offset*(tid+1) - 1;
-            unsigned int  bi = ai + (offset >> 1);
-
-            s_compaction_list_exc[bi] =   s_compaction_list_exc[bi]
-                                          + s_compaction_list_exc[ai];
-        }
-    }
-
-    __syncthreads();
-
-}
-
-/** @brief Perform stream compaction for second child intervals
-*
-* @param  s_left              shared memory storage for left interval limits
-* @param  s_right             shared memory storage for right interval limits
-* @param  s_left_count        shared memory storage for number of eigenvalues 
less than left interval limits
-* @param  s_right_count       shared memory storage for number of eigenvalues 
less than right interval limits
-* @param  mid                 midpoint of current interval (left of new 
interval)
-* @param  right               upper limit of interval
-* @param  mid_count           eigenvalues less than mid
-* @param  right_count         eigenvalues less than right
-* @param  s_compaction_list   list containing the indices where the data has 
to be stored
-* @param  num_threads_active  number of active threads / intervals
-* @param  is_active_second    mark is thread has a second non-empty child 
interval
-*/
-template<class T, class NumericT>
-__device__
-void
-compactIntervals(NumericT *s_left, NumericT *s_right,
-                 T *s_left_count, T *s_right_count,
-                 NumericT mid, NumericT right,
-                 unsigned int mid_count, unsigned int right_count,
-                 T *s_compaction_list,
-                 unsigned int num_threads_active,
-                 unsigned int is_active_second)
-{
-    const unsigned int tid = threadIdx.x;
-
-    // perform compaction / copy data for all threads where the second
-    // child is not dead
-    if ((tid < num_threads_active) && (1 == is_active_second))
-    {
-        unsigned int addr_w = num_threads_active + s_compaction_list[tid];
-        s_left[addr_w] = mid;
-        s_right[addr_w] = right;
-        s_left_count[addr_w] = mid_count;
-        s_right_count[addr_w] = right_count;
-    }
-}
-
-template<class T, class S, class NumericT>
-__device__
-void
-storeIntervalConverged(NumericT *s_left, NumericT *s_right,
-                       T *s_left_count, T *s_right_count,
-                       NumericT &left, NumericT &mid, NumericT &right,
-                       S &left_count, S &mid_count, S &right_count,
-                       T *s_compaction_list_exc,
-                       unsigned int &compact_second_chunk,
-                       const unsigned int num_threads_active,
-                       unsigned int &is_active_second)
-{
-    const unsigned int tid = threadIdx.x;
-    const unsigned int multiplicity = right_count - left_count;
-    // check multiplicity of eigenvalue
-    if (1 == multiplicity)
-    {
-
-        // just re-store intervals, simple eigenvalue
-        s_left[tid] = left;
-        s_right[tid] = right;
-        s_left_count[tid] = left_count;
-        s_right_count[tid] = right_count;
-
-
-        // mark that no second child / clear
-        is_active_second = 0;
-        s_compaction_list_exc[tid] = 0;
-    }
-    else
-    {
-
-        // number of eigenvalues after the split less than mid
-        mid_count = left_count + (multiplicity >> 1);
-
-        // store left interval
-        s_left[tid] = left;
-        s_right[tid] = right;
-        s_left_count[tid] = left_count;
-        s_right_count[tid] = mid_count;
-        mid = left;
-
-        // mark that second child interval exists
-        is_active_second = 1;
-        s_compaction_list_exc[tid] = 1;
-        compact_second_chunk = 1;
-    }
-}
-
-/** @brief Subdivide interval if active and not already converged.
-*
-* @param  tid                    id of thread
-* @param  s_left                 shared memory storage for left interval limits
-* @param  s_right                shared memory storage for right interval 
limits
-* @param  s_left_count           shared memory storage for number of 
eigenvalues less than left interval limits
-* @param  s_right_count          shared memory storage for number of 
eigenvalues less than right interval limits
-* @param  num_threads_active     number of active threads in warp
-* @param  left                   lower limit of interval
-* @param  right                  upper limit of interval
-* @param  left_count             eigenvalues less than \a left
-* @param  right_count            eigenvalues less than \a right
-* @param  mid                    median of interval
-* @param  all_threads_converged  shared memory flag if all threads are
-*/
-template<class T, class NumericT>
-__device__
-void
-subdivideActiveIntervalMulti(const unsigned int tid,
-                        NumericT *s_left, NumericT *s_right,
-                        T *s_left_count, T *s_right_count,
-                        const unsigned int num_threads_active,
-                        NumericT &left, NumericT &right,
-                        unsigned int &left_count, unsigned int &right_count,
-                        NumericT &mid, unsigned int &all_threads_converged)
-{
-  // for all active threads
-  if (tid < num_threads_active)
-  {
-
-    left = s_left[tid];
-    right = s_right[tid];
-    left_count = s_left_count[tid];
-    right_count = s_right_count[tid];
-
-    // check if thread already converged
-    if (left != right)
-    {
-
-      mid = computeMidpoint(left, right);
-      all_threads_converged = 0;
-    }
-    else if ((right_count - left_count) > 1)
-    {
-      // mark as not converged if multiple eigenvalues enclosed
-      // duplicate interval in storeIntervalsConverged()
-      all_threads_converged = 0;
-    }
-
-  }  // end for all active threads
-}
-
-
-/** @brief Subdivide interval if active and not already converged.
-*
-* @param  tid                    id of thread
-* @param  s_left                 shared memory storage for left interval limits
-* @param  s_right                shared memory storage for right interval 
limits
-* @param  s_left_count           shared memory storage for number of 
eigenvalues less than left interval limits
-* @param  s_right_count          shared memory storage for number of 
eigenvalues less than right interval limits
-* @param  num_threads_active     number of active threads in warp
-* @param  left                   lower limit of interval
-* @param  right                  upper limit of interval
-* @param  left_count             eigenvalues less than \a left
-* @param  right_count            eigenvalues less than \a right
-* @param  mid                    median of interval
-* @param  all_threads_converged  shared memory flag if all threads are
-*/
-template<class T, class NumericT>
-__device__
-void
-subdivideActiveInterval(const unsigned int tid,
-                        NumericT *s_left, NumericT *s_right,
-                        T *s_left_count, T *s_right_count,
-                        const unsigned int num_threads_active,
-                        NumericT &left, NumericT &right,
-                        unsigned int &left_count, unsigned int &right_count,
-                        NumericT &mid, unsigned int &all_threads_converged)
-{
-  // for all active threads
-  if (tid < num_threads_active)
-  {
-
-    left = s_left[tid];
-    right = s_right[tid];
-    left_count = s_left_count[tid];
-    right_count = s_right_count[tid];
-
-    // check if thread already converged
-    if (left != right)
-    {
-
-      mid = computeMidpoint(left, right);
-      all_threads_converged = 0;
-    }
-  }  // end for all active threads
-}
-}
-}
-}
-
-#endif // #ifndef VIENNACL_LINALG_DETAIL_BISECT_UTIL_HPP_
-

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp
deleted file mode 100644
index 3622b89..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp
+++ /dev/null
@@ -1,250 +0,0 @@
-#ifndef VIENNACL_LINALG_CUDA_COMMON_HPP_
-#define VIENNACL_LINALG_CUDA_COMMON_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file viennacl/linalg/cuda/common.hpp
-    @brief Common routines for CUDA execution
-*/
-
-#include <sstream>
-#include <cuda_runtime.h>
-#include "viennacl/backend/cuda.hpp"
-#include "viennacl/traits/handle.hpp"
-
-#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)  detail::cuda_last_error_check 
(message, __FILE__, __LINE__)
-
-namespace viennacl
-{
-
-////// scalar
-
-/** @brief Convenience helper function for extracting the CUDA handle from a 
ViennaCL scalar. Non-const version. */
-template<typename NumericT>
-NumericT * cuda_arg(scalar<NumericT> & obj)
-{
-  return reinterpret_cast<NumericT 
*>(viennacl::traits::handle(obj).cuda_handle().get());
-}
-
-/** @brief Convenience helper function for extracting the CUDA handle from a 
ViennaCL scalar. Const version. */
-template<typename NumericT>
-const NumericT * cuda_arg(scalar<NumericT> const & obj)
-{
-  return reinterpret_cast<const NumericT 
*>(viennacl::traits::handle(obj).cuda_handle().get());
-}
-
-
-////// vector_base
-
-/** @brief Convenience helper function for extracting the CUDA handle from a 
ViennaCL vector (through the base class vector_base) with implicit return type 
deduction. Non-const version. */
-template<typename NumericT>
-NumericT * cuda_arg(vector_base<NumericT> & obj)
-{
-  return reinterpret_cast<NumericT 
*>(viennacl::traits::handle(obj).cuda_handle().get());
-}
-
-/** @brief Convenience helper function for extracting the CUDA handle from a 
ViennaCL vector (through the base class vector_base) with implicit return type 
deduction. Const version. */
-template<typename NumericT>
-const NumericT * cuda_arg(vector_base<NumericT> const & obj)
-{
-  return reinterpret_cast<const NumericT 
*>(viennacl::traits::handle(obj).cuda_handle().get());
-}
-
-/** @brief Convenience helper function for extracting the CUDA handle from a 
ViennaCL vector (through the base class vector_base). Return type needs to be 
explicitly provided as first template argument. Non-const version. */
-template<typename ReturnT, typename NumericT>
-ReturnT * cuda_arg(vector_base<NumericT> & obj)
-{
-  return reinterpret_cast<ReturnT 
*>(viennacl::traits::handle(obj).cuda_handle().get());
-}
-
-/** @brief Convenience helper function for extracting the CUDA handle from a 
ViennaCL vector (through the base class vector_base). Return type needs to be 
explicitly provided as first template argument. Const version. */
-template<typename ReturnT, typename NumericT>
-const ReturnT * cuda_arg(vector_base<NumericT> const & obj)
-{
-  return reinterpret_cast<const ReturnT 
*>(viennacl::traits::handle(obj).cuda_handle().get());
-}
-
-
-////// matrix_base
-
-/** @brief Convenience helper function for extracting the CUDA handle from a 
ViennaCL matrix (through the base class matrix_base). Non-const version. */
-template<typename NumericT>
-NumericT * cuda_arg(matrix_base<NumericT> & obj)
-{
-  return reinterpret_cast<NumericT 
*>(viennacl::traits::handle(obj).cuda_handle().get());
-}
-
-/** @brief Convenience helper function for extracting the CUDA handle from a 
ViennaCL matrix (through the base class matrix_base). Const version. */
-template<typename NumericT>
-const NumericT * cuda_arg(matrix_base<NumericT> const & obj)
-{
-  return reinterpret_cast<const NumericT 
*>(viennacl::traits::handle(obj).cuda_handle().get());
-}
-
-
-
-////// mem_handle
-
-
-/** @brief Convenience helper function for extracting the CUDA handle from a 
generic memory handle. Non-const version. */
-template<typename ReturnT>
-ReturnT * cuda_arg(viennacl::backend::mem_handle & h)
-{
-  return reinterpret_cast<ReturnT *>(h.cuda_handle().get());
-}
-
-/** @brief Convenience helper function for extracting the CUDA handle from a 
generic memory handle. Const-version. */
-template<typename ReturnT>
-ReturnT const * cuda_arg(viennacl::backend::mem_handle const & h)
-{
-  return reinterpret_cast<const ReturnT *>(h.cuda_handle().get());
-}
-
-/** \cond **/
-template<typename ReturnT>
-ReturnT * cuda_arg(viennacl::backend::mem_handle::cuda_handle_type & h)
-{
-  return reinterpret_cast<ReturnT *>(h.get());
-}
-
-template<typename ReturnT>
-ReturnT const *  cuda_arg(viennacl::backend::mem_handle::cuda_handle_type 
const & h)
-{
-  return reinterpret_cast<const ReturnT *>(h.get());
-}
-
-inline unsigned int cuda_arg(unsigned int val)  { return val; }
-
-template<typename NumericT> char           cuda_arg(char val)           { 
return val; }
-template<typename NumericT> unsigned char  cuda_arg(unsigned char val)  { 
return val; }
-
-template<typename NumericT> short          cuda_arg(short val)          { 
return val; }
-template<typename NumericT> unsigned short cuda_arg(unsigned short val) { 
return val; }
-
-template<typename NumericT> int            cuda_arg(int val)            { 
return val; }
-template<typename NumericT> unsigned int   cuda_arg(unsigned int val)   { 
return val; }
-
-template<typename NumericT> long           cuda_arg(long val)           { 
return val; }
-template<typename NumericT> unsigned long  cuda_arg(unsigned long val)  { 
return val; }
-
-template<typename NumericT> float          cuda_arg(float val)          { 
return val; }
-template<typename NumericT> double         cuda_arg(double val)         { 
return val; }
-
-/** \endcond */
-
-
-namespace linalg
-{
-namespace cuda
-{
-
-
-namespace detail
-{
-
-inline unsigned int make_options(vcl_size_t length, bool reciprocal, bool 
flip_sign)
-{
-  return static_cast<unsigned int>( ((length > 1) ? (static_cast<unsigned 
int>(length) << 2) : 0) + (reciprocal ? 2 : 0) + (flip_sign ? 1 : 0) );
-}
-
-inline void cuda_last_error_check(const char * message, const char * file, 
const int line )
-{
-  cudaError_t error_code = cudaGetLastError();
-
-  if (cudaSuccess != error_code)
-  {
-    std::stringstream ss;
-    ss << file << "(" << line << "): " << ": getLastCudaError() CUDA error " 
<< error_code << ": " << cudaGetErrorString( error_code ) << " @ " << message 
<< std::endl;
-    throw viennacl::backend::cuda::cuda_exception(ss.str(), error_code);
-  }
-}
-
-template<typename NumericT>
-struct type_to_type2;
-
-template<>
-struct type_to_type2<float> { typedef float2  type; };
-
-template<>
-struct type_to_type2<double> { typedef double2  type; };
-
-
-template<typename NumericT, typename OtherT>
-typename viennacl::backend::mem_handle::cuda_handle_type & 
arg_reference(viennacl::scalar<NumericT> & s, OtherT) { return 
s.handle().cuda_handle(); }
-
-template<typename NumericT, typename OtherT>
-typename viennacl::backend::mem_handle::cuda_handle_type const & 
arg_reference(viennacl::scalar<NumericT> const & s, OtherT) { return 
s.handle().cuda_handle(); }
-
-// all other cases where T is not a ViennaCL scalar
-template<typename ArgT>
-typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
-                              char const &>::type
-arg_reference(ArgT, char const & val)  { return val; }
-
-template<typename ArgT>
-typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
-                              unsigned char const &>::type
-arg_reference(ArgT, unsigned char const & val)  { return val; }
-
-template<typename ArgT>
-typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
-                              short const &>::type
-arg_reference(ArgT, short const & val)  { return val; }
-
-template<typename ArgT>
-typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
-                              unsigned short const &>::type
-arg_reference(ArgT, unsigned short const & val)  { return val; }
-
-template<typename ArgT>
-typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
-                              int const &>::type
-arg_reference(ArgT, int const & val)  { return val; }
-
-template<typename ArgT>
-typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
-                              unsigned int const &>::type
-arg_reference(ArgT, unsigned int const & val)  { return val; }
-
-template<typename ArgT>
-typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
-                              long const &>::type
-arg_reference(ArgT, long const & val)  { return val; }
-
-template<typename ArgT>
-typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
-                              unsigned long const &>::type
-arg_reference(ArgT, unsigned long const & val)  { return val; }
-
-template<typename ArgT>
-typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
-                              float const &>::type
-arg_reference(ArgT, float const & val)  { return val; }
-
-template<typename ArgT>
-typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
-                              double const &>::type
-arg_reference(ArgT, double const & val)  { return val; }
-
-} //namespace detail
-} //namespace cuda
-} //namespace linalg
-} //namespace viennacl
-
-
-#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp
deleted file mode 100644
index ae70f9a..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp
+++ /dev/null
@@ -1,412 +0,0 @@
-#ifndef VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP
-#define VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file viennacl/linalg/cuda/direct_solve.hpp
-    @brief Implementations of dense direct solvers using CUDA are found here.
-*/
-
-#include "viennacl/forwards.h"
-#include "viennacl/vector.hpp"
-#include "viennacl/matrix.hpp"
-
-
-#include "viennacl/linalg/cuda/common.hpp"
-
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace cuda
-{
-
-template<typename NumericT>
-__global__ void matrix_matrix_upper_solve_kernel(
-          const NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1, unsigned int A_internal_size2,
-          bool row_major_A,
-
-          NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_size1,  unsigned int B_size2,
-          unsigned int B_internal_size1, unsigned int B_internal_size2,
-          bool row_major_B,
-
-          bool unit_diagonal)
-{
-  NumericT temp;
-  NumericT entry_A;
-
-  for (unsigned int row_cnt = 0; row_cnt < A_size1; ++row_cnt)
-  {
-    unsigned int row = A_size1 - 1 - row_cnt;
-
-    if (!unit_diagonal)
-    {
-      __syncthreads();
-
-      if (threadIdx.x == 0)
-      {
-        if (row_major_B)
-          B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * 
B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * 
A_internal_size2 + (row * A_inc2 + A_start2)]
-                                                                               
                               : A[(row * A_inc1 + A_start1) + (row * A_inc2 + 
A_start2)*A_internal_size1];
-        else //if (!row_major_B)
-          B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * 
B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * 
A_internal_size2 + (row * A_inc2 + A_start2)]
-                                                                               
                               : A[(row * A_inc1 + A_start1) + (row * A_inc2 + 
A_start2)*A_internal_size1];
-      }
-    }
-
-    __syncthreads();
-
-    if (row_major_B)
-      temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * 
B_inc2 + B_start2)];
-    else //if (!row_major_B)
-      temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * 
B_internal_size1];
-
-    //eliminate column of op(A) with index 'row' in parallel: " << std::endl;
-    for  (unsigned int elim = threadIdx.x; elim < row; elim += blockDim.x)
-    {
-      if (row_major_A)
-        entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * 
A_inc2 + A_start2)];
-      else //if (!row_major_A)
-        entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * 
A_internal_size1];
-
-      if (row_major_B)
-        B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 
+ B_start2)] -= temp * entry_A;
-      else //if (!row_major_B)
-        B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * 
B_internal_size1] -= temp * entry_A;
-
-    }
-  }
-}
-
-
-
-template<typename NumericT>
-__global__ void matrix_matrix_lower_solve_kernel(
-          const NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1, unsigned int A_internal_size2,
-          bool row_major_A,
-
-          NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_size1,  unsigned int B_size2,
-          unsigned int B_internal_size1, unsigned int B_internal_size2,
-          bool row_major_B,
-
-          bool unit_diagonal)
-{
-  NumericT temp;
-  NumericT entry_A;
-
-  for (unsigned int row = 0; row < A_size1; ++row)
-  {
-
-    if (!unit_diagonal)
-    {
-      __syncthreads();
-
-      if (threadIdx.x == 0)
-      {
-        if (row_major_B)
-          B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * 
B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * 
A_internal_size2 + (row * A_inc2 + A_start2)]
-                                                                               
                               : A[(row * A_inc1 + A_start1) + (row * A_inc2 + 
A_start2)*A_internal_size1];
-        else //if (!row_major_B)
-          B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * 
B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * 
A_internal_size2 + (row * A_inc2 + A_start2)]
-                                                                               
                               : A[(row * A_inc1 + A_start1) + (row * A_inc2 + 
A_start2)*A_internal_size1];
-      }
-    }
-
-    __syncthreads();
-
-    if (row_major_B)
-      temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * 
B_inc2 + B_start2)];
-    else //if (!row_major_B)
-      temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * 
B_internal_size1];
-
-    //eliminate column of op(A) with index 'row' in parallel: " << std::endl;
-    for  (unsigned int elim = row + threadIdx.x + 1; elim < A_size1; elim += 
blockDim.x)
-    {
-      if (row_major_A)
-        entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * 
A_inc2 + A_start2)];
-      else //if (!row_major_A)
-        entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * 
A_internal_size1];
-
-      if (row_major_B)
-        B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 
+ B_start2)] -= temp * entry_A;
-      else //if (!row_major_B)
-        B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * 
B_internal_size1] -= temp * entry_A;
-
-    }
-  }
-}
-
-
-
-
-
-
namespace detail
{
  // Compile-time tag dispatch: map a solver tag onto the two boolean
  // properties the triangular-solve kernels need (unit diagonal? upper?).

  /** @brief Generic fallback: tag does not request a unit diagonal. */
  template<typename TagT>
  bool is_unit_solve(TagT const & tag) { return false; }

  inline bool is_unit_solve(viennacl::linalg::unit_lower_tag) { return true; }
  inline bool is_unit_solve(viennacl::linalg::unit_upper_tag) { return true; }

  /** @brief Generic fallback: tag is not an upper-triangular solve. */
  template<typename TagT>
  bool is_upper_solve(TagT const & tag) { return false; }

  inline bool is_upper_solve(viennacl::linalg::upper_tag) { return true; }
  inline bool is_upper_solve(viennacl::linalg::unit_upper_tag) { return true; }

  /** @brief Launches the CUDA kernel for the inplace triangular solve A \ B with multiple right-hand sides.
  *
  * Launch configuration: one thread block of 128 threads per column of B
  * (grid = B.size2()); each block eliminates one right-hand-side column.
  *
  * @param A    The (triangular) system matrix
  * @param B    The right-hand-side matrix; overwritten with the solution
  * @param tag  Solver tag (upper/lower, optionally unit-diagonal)
  */
  template<typename Matrix1T, typename Matrix2T, typename SolverTagT>
  void inplace_solve_impl(Matrix1T const & A,
                          Matrix2T & B,
                          SolverTagT const & tag)
  {
    typedef typename viennacl::result_of::cpu_value_type<Matrix1T>::type       value_type;

    dim3 threads(128);
    dim3 grid(B.size2());   // one block per right-hand-side column of B

    // NOTE: the kernel argument order (start1/start2, stride1/stride2,
    // size1/size2, internal_size1/internal_size2, row_major flag) must match
    // the kernel signatures exactly — do not reorder.
    if (is_upper_solve(tag))
    {
      matrix_matrix_upper_solve_kernel<<<grid,threads>>>(viennacl::cuda_arg(A),
                                                         static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
                                                         static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
                                                         static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
                                                         static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
                                                         bool(A.row_major()),

                                                         viennacl::cuda_arg(B),
                                                         static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
                                                         static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
                                                         static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
                                                         static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
                                                         bool(B.row_major()),

                                                         is_unit_solve(tag)
                                                        );
    }
    else
    {
      matrix_matrix_lower_solve_kernel<<<grid,threads>>>(viennacl::cuda_arg(A),
                                                         static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
                                                         static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
                                                         static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
                                                         static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
                                                         bool(A.row_major()),

                                                         viennacl::cuda_arg(B),
                                                         static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
                                                         static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
                                                         static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
                                                         static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
                                                         bool(B.row_major()),

                                                         is_unit_solve(tag)
                                                        );
    }

  }
}
-
-
-//
-// Note: By convention, all size checks are performed in the calling frontend. 
No need to double-check here.
-//
-
-////////////////// triangular solver //////////////////////////////////////
/** @brief Inplace direct triangular solve A \ B (MATLAB notation) with multiple right-hand sides.
*
* Thin frontend that forwards to detail::inplace_solve_impl, which selects the
* upper- or lower-triangular CUDA kernel based on the solver tag.
*
* @param A    The triangular system matrix
* @param B    The right-hand-side matrix; the solution is written into it
* @param tag  Solver tag identifying the triangular solver variant
*/
template<typename NumericT, typename SolverTagT>
void inplace_solve(matrix_base<NumericT> const & A,
                   matrix_base<NumericT> & B,
                   SolverTagT tag)
{
  detail::inplace_solve_impl(A, B, tag);
}
-
-
-//
-//  Solve on vector
-//
-
/** @brief Forward/backward substitution kernel for a row-major triangular matrix A and a single right-hand-side vector v (solved in place).
*
* Launch configuration: a SINGLE thread block (the host wrapper launches
* <<<1, 128>>>). Rows are processed sequentially and ordered via
* __syncthreads(), which synchronizes only within one block.
*
* options bitfield (see detail::get_option_for_solver_tag):
*   bit 0 - unit diagonal (skip the division by the diagonal entry)
*   bit 2 - lower-triangular solve (forward substitution); otherwise upper
*
* Note: v_size is not used by this kernel. A is required to be square.
*/
template<typename NumericT>
__global__ void triangular_substitute_inplace_row_kernel(
          NumericT const * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,
          NumericT * v,
          unsigned int v_start,
          unsigned int v_inc,
          unsigned int v_size,

          unsigned int options)
{
  NumericT temp;
  unsigned int unit_diagonal_flag  = (options & (1 << 0));

  unsigned int is_lower_solve      = (options & (1 << 2));
  unsigned int row;
  for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)    //Note: A required to be square
  {
    // Lower solve walks rows top-down, upper solve bottom-up.
    row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
    if (!unit_diagonal_flag)
    {
      // Branch is uniform across the block (options is a kernel argument),
      // so this __syncthreads() is reached by all threads or by none.
      __syncthreads();
      if (threadIdx.x == 0)
        v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
    }

    // Make the updated v[row] visible to all threads before elimination.
    __syncthreads();

    temp = v[row * v_inc + v_start];

    // Eliminate column 'row' from the remaining equations in parallel:
    // lower solve updates rows below 'row', upper solve rows above it.
    for (int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
            elim < (is_lower_solve ? A_size1 : row);
            elim += blockDim.x)
      v[elim * v_inc + v_start] -= temp * A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row  * A_inc2 + A_start2)];
  }
}
-
-
/** @brief Forward/backward substitution kernel for a column-major triangular matrix A and a single right-hand-side vector v (solved in place).
*
* Identical algorithm to triangular_substitute_inplace_row_kernel, differing
* only in the addressing of A (column-major: row index contiguous, column
* scaled by A_internal_size1).
*
* Launch configuration: a SINGLE thread block (the host wrapper launches
* <<<1, 128>>>); __syncthreads() orders the sequential row updates.
*
* options bitfield: bit 0 - unit diagonal, bit 2 - lower-triangular solve.
* Note: v_size is not used by this kernel. A is required to be square.
*/
template<typename NumericT>
__global__ void triangular_substitute_inplace_col_kernel(
          NumericT const * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,
          NumericT * v,
          unsigned int v_start,
          unsigned int v_inc,
          unsigned int v_size,
          unsigned int options)
{
  NumericT temp;
  unsigned int unit_diagonal_flag  = (options & (1 << 0));

  unsigned int is_lower_solve      = (options & (1 << 2));
  unsigned int row;
  for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)    //Note: A required to be square
  {
    // Lower solve walks rows top-down, upper solve bottom-up.
    row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
    if (!unit_diagonal_flag)
    {
      // Uniform branch (options is a kernel argument) — barrier is safe here.
      __syncthreads();
      if (threadIdx.x == 0)
        v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
    }

    // Publish the updated v[row] to all threads before elimination.
    __syncthreads();

    temp = v[row * v_inc + v_start];

    // Parallel elimination of column 'row' from the remaining equations.
    for (int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
            elim < (is_lower_solve ? A_size1 : row);
            elim += blockDim.x)
      v[elim * v_inc + v_start] -= temp * A[(elim * A_inc1 + A_start1) + (row  * A_inc2 + A_start2) * A_internal_size1];
  }
}
-
-
namespace detail
{
  // Encode the solver tag as the bitfield consumed by the substitution
  // kernels: bit 0 -> unit diagonal, bit 2 -> lower-triangular solve.
  inline unsigned int get_option_for_solver_tag(viennacl::linalg::upper_tag)      { return 0; }
  inline unsigned int get_option_for_solver_tag(viennacl::linalg::unit_upper_tag) { return (1 << 0); }
  inline unsigned int get_option_for_solver_tag(viennacl::linalg::lower_tag)      { return (1 << 2); }
  inline unsigned int get_option_for_solver_tag(viennacl::linalg::unit_lower_tag) { return (1 << 2) | (1 << 0); }

  /** @brief Launches the triangular substitution kernel for mat \ vec, dispatching on the storage layout of mat.
  *
  * Uses a single block of 128 threads: the kernels serialize rows via
  * __syncthreads(), which synchronizes only within one block.
  *
  * @param mat      The (triangular) system matrix
  * @param vec      The right-hand-side vector; overwritten with the solution
  * @param options  Bitfield from get_option_for_solver_tag()
  */
  template<typename MatrixT, typename VectorT>
  void inplace_solve_vector_impl(MatrixT const & mat,
                                 VectorT & vec,
                                 unsigned int options)
  {
    typedef typename viennacl::result_of::cpu_value_type<MatrixT>::type        value_type;

    // NOTE: kernel argument order (start1/start2, stride1/stride2,
    // size1/size2, internal sizes, then vector start/stride/size) must match
    // the kernel signatures exactly — do not reorder.
    if (mat.row_major())
    {
      triangular_substitute_inplace_row_kernel<<<1, 128>>>(viennacl::cuda_arg(mat),
                                                           static_cast<unsigned int>(viennacl::traits::start1(mat)),         static_cast<unsigned int>(viennacl::traits::start2(mat)),
                                                           static_cast<unsigned int>(viennacl::traits::stride1(mat)),        static_cast<unsigned int>(viennacl::traits::stride2(mat)),
                                                           static_cast<unsigned int>(viennacl::traits::size1(mat)),          static_cast<unsigned int>(viennacl::traits::size2(mat)),
                                                           static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
                                                           viennacl::cuda_arg(vec),
                                                           static_cast<unsigned int>(viennacl::traits::start(vec)),
                                                           static_cast<unsigned int>(viennacl::traits::stride(vec)),
                                                           static_cast<unsigned int>(viennacl::traits::size(vec)),
                                                           options
                                                          );
    }
    else
    {
      triangular_substitute_inplace_col_kernel<<<1, 128>>>(viennacl::cuda_arg(mat),
                                                           static_cast<unsigned int>(viennacl::traits::start1(mat)),         static_cast<unsigned int>(viennacl::traits::start2(mat)),
                                                           static_cast<unsigned int>(viennacl::traits::stride1(mat)),        static_cast<unsigned int>(viennacl::traits::stride2(mat)),
                                                           static_cast<unsigned int>(viennacl::traits::size1(mat)),          static_cast<unsigned int>(viennacl::traits::size2(mat)),
                                                           static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
                                                           viennacl::cuda_arg(vec),
                                                           static_cast<unsigned int>(viennacl::traits::start(vec)),
                                                           static_cast<unsigned int>(viennacl::traits::stride(vec)),
                                                           static_cast<unsigned int>(viennacl::traits::size(vec)),
                                                           options
                                                          );
    }
  }

}
-
/** @brief Inplace direct solver for dense triangular systems with a single right-hand-side vector (non-transposed version).
*
* Translates the solver tag into the kernel option bitfield and dispatches to
* the layout-specific substitution kernel.
*
* @param mat  The triangular system matrix
* @param vec  The right-hand-side vector; the solution is written into it
*/
template<typename NumericT, typename SolverTagT>
void inplace_solve(matrix_base<NumericT> const & mat,
                   vector_base<NumericT> & vec,
                   SolverTagT)
{
  detail::inplace_solve_vector_impl(mat, vec, detail::get_option_for_solver_tag(SolverTagT()));
}
-
-
-}
-}
-}
-
-#endif

Reply via email to