http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp
deleted file mode 100644
index 198ac31..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp
+++ /dev/null
@@ -1,858 +0,0 @@
-#ifndef VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_
-#define VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_
-
-/* =========================================================================
-  Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file viennacl/linalg/cuda/fft_operations.hpp
-    @brief Implementations of Fast Furier Transformation using cuda
-*/
-#include <cmath>
-#include <viennacl/matrix.hpp>
-#include <viennacl/vector.hpp>
-
-#include "viennacl/forwards.h"
-#include "viennacl/scalar.hpp"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/linalg/cuda/common.hpp"
-#include "viennacl/linalg/host_based/vector_operations.hpp"
-#include "viennacl/linalg/host_based/fft_operations.hpp"
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace cuda
-{
-namespace detail
-{
-  namespace fft
-  {
-    const vcl_size_t MAX_LOCAL_POINTS_NUM = 512;
-
-    inline vcl_size_t num_bits(vcl_size_t size)
-    {
-      vcl_size_t bits_datasize = 0;
-      vcl_size_t ds = 1;
-
-      while (ds < size)
-      {
-        ds = ds << 1;
-        bits_datasize++;
-      }
-
-      return bits_datasize;
-    }
-
-    inline vcl_size_t next_power_2(vcl_size_t n)
-    {
-      n = n - 1;
-
-      vcl_size_t power = 1;
-
-      while (power < sizeof(vcl_size_t) * 8)
-      {
-        n = n | (n >> power);
-        power *= 2;
-      }
-
-      return n + 1;
-    }
-
-  } //namespace fft
-} //namespace detail
-
-// addition
-inline __host__ __device__ float2 operator+(float2 a, float2 b)
-{
-  return make_float2(a.x + b.x, a.y + b.y);
-}
-
-// subtract
-inline __host__ __device__ float2 operator-(float2 a, float2 b)
-{
-  return make_float2(a.x - b.x, a.y - b.y);
-}
-// division
-template<typename SCALARTYPE>
-inline __device__ float2 operator/(float2 a,SCALARTYPE b)
-{
-  return make_float2(a.x/b, a.y/b);
-}
-
-//multiplication
-inline __device__ float2 operator*(float2 in1, float2 in2)
-{
-  return make_float2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * 
in2.x);
-}
-
-// addition
-inline __host__ __device__ double2 operator+(double2 a, double2 b)
-{
-  return make_double2(a.x + b.x, a.y + b.y);
-}
-
-// subtraction
-inline __host__ __device__ double2 operator-(double2 a, double2 b)
-{
-  return make_double2(a.x - b.x, a.y - b.y);
-}
-
-// division
-template<typename SCALARTYPE>
-inline __host__ __device__ double2 operator/(double2 a,SCALARTYPE b)
-{
-  return make_double2(a.x/b, a.y/b);
-}
-
-//multiplication
-inline __host__ __device__ double2 operator*(double2 in1, double2 in2)
-{
-  return make_double2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * 
in2.x);
-}
-
-inline __device__ unsigned int get_reorder_num(unsigned int v, unsigned int 
bit_size)
-{
-  v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
-  v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
-  v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
-  v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
-  v = (v >> 16) | (v << 16);
-  v = v >> (32 - bit_size);
-  return v;
-}
-
-template<typename Numeric2T, typename NumericT>
-__global__ void fft_direct(
-    const Numeric2T * input,
-    Numeric2T * output,
-    unsigned int size,
-    unsigned int stride,
-    unsigned int batch_num,
-    NumericT sign,
-    bool is_row_major)
-{
-
-  const NumericT NUM_PI(3.14159265358979323846);
-
-  for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
-  {
-    for (unsigned int k = blockIdx.x * blockDim.x + threadIdx.x; k < size; k 
+= gridDim.x * blockDim.x)
-    {
-      Numeric2T f;
-      f.x = 0;
-      f.y = 0;
-
-      for (unsigned int n = 0; n < size; n++)
-      {
-        Numeric2T in;
-        if (!is_row_major)
-          in = input[batch_id * stride + n];   //input index here
-        else
-          in = input[n * stride + batch_id];//input index here
-
-        NumericT sn,cs;
-        NumericT arg = sign * 2 * NUM_PI * k / size * n;
-        sn = sin(arg);
-        cs = cos(arg);
-
-        Numeric2T ex;
-        ex.x = cs;
-        ex.y = sn;
-        Numeric2T tmp;
-        tmp.x = in.x * ex.x - in.y * ex.y;
-        tmp.y = in.x * ex.y + in.y * ex.x;
-        f = f + tmp;
-      }
-
-      if (!is_row_major)
-        output[batch_id * stride + k] = f; // output index here
-      else
-        output[k * stride + batch_id] = f;// output index here
-    }
-  }
-}
-
-/**
- * @brief Direct 1D algorithm for computing Fourier transformation.
- *
- * Works on any sizes of data.
- * Serial implementation has o(n^2) complexity
- */
-template<typename NumericT, unsigned int AlignmentV>
-void direct(viennacl::vector<NumericT, AlignmentV> const & in,
-            viennacl::vector<NumericT, AlignmentV>       & out,
-            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num,
-            NumericT sign = NumericT(-1),
-            
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER 
data_order = 
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-  typedef typename 
viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
-
-  fft_direct<<<128,128>>>(reinterpret_cast<const numeric2_type 
*>(viennacl::cuda_arg(in)),
-                          reinterpret_cast<      numeric2_type 
*>(viennacl::cuda_arg(out)),
-                          static_cast<unsigned int>(size),
-                          static_cast<unsigned int>(stride),
-                          static_cast<unsigned int>(batch_num),
-                          sign,
-                          bool(data_order != 
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR));
-  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_direct");
-}
-
-/**
- * @brief Direct 2D algorithm for computing Fourier transformation.
- *
- * Works on any sizes of data.
- * Serial implementation has o(n^2) complexity
- */
-template<typename NumericT, unsigned int AlignmentV>
-void direct(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const 
& in,
-            viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>       
& out,
-            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num,
-            NumericT sign = NumericT(-1),
-            
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER 
data_order = 
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-  typedef typename 
viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
-
-  fft_direct<<<128,128>>>(reinterpret_cast<const numeric2_type 
*>(viennacl::cuda_arg(in)),
-                          reinterpret_cast<      numeric2_type 
*>(viennacl::cuda_arg(out)),
-                          static_cast<unsigned int>(size),
-                          static_cast<unsigned int>(stride),
-                          static_cast<unsigned int>(batch_num),
-                          sign,
-                          bool(data_order != 
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR));
-  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_direct");
-}
-
-template<typename NumericT>
-__global__ void fft_reorder(NumericT * input,
-                            unsigned int bit_size,
-                            unsigned int size,
-                            unsigned int stride,
-                            unsigned int batch_num,
-                            bool is_row_major)
-{
-
-  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
-  unsigned int glb_sz = gridDim.x * blockDim.x;
-
-  for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
-  {
-    for (unsigned int i = glb_id; i < size; i += glb_sz)
-    {
-      unsigned int v = get_reorder_num(i, bit_size);
-
-      if (i < v)
-      {
-        if (!is_row_major)
-        {
-          NumericT tmp = input[batch_id * stride + i];    // index
-          input[batch_id * stride + i] = input[batch_id * stride + v];//index
-          input[batch_id * stride + v] = tmp;//index
-        }
-        else
-        {
-          NumericT tmp = input[i * stride + batch_id];
-          input[i * stride + batch_id] = input[v * stride + batch_id];
-          input[v * stride + batch_id] = tmp;
-        }
-      }
-    }
-  }
-}
-
-/***
- * This function performs reorder of input data. Indexes are sorted in 
bit-reversal order.
- * Such reordering should be done before in-place FFT.
- */
-template<typename NumericT, unsigned int AlignmentV>
-void reorder(viennacl::vector<NumericT, AlignmentV> & in,
-             vcl_size_t size, vcl_size_t stride, vcl_size_t bits_datasize, 
vcl_size_t batch_num,
-             
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER 
data_order = 
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-  typedef typename 
viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
-
-  fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type 
*>(viennacl::cuda_arg(in)),
-                           static_cast<unsigned int>(bits_datasize),
-                           static_cast<unsigned int>(size),
-                           static_cast<unsigned int>(stride),
-                           static_cast<unsigned int>(batch_num),
-                           static_cast<bool>(data_order));
-  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
-}
-
-template<typename Numeric2T, typename NumericT>
-__global__ void fft_radix2_local(Numeric2T * input,
-                                 unsigned int bit_size,
-                                 unsigned int size,
-                                 unsigned int stride,
-                                 unsigned int batch_num,
-                                 NumericT sign,
-                                 bool is_row_major)
-{
-  __shared__ Numeric2T lcl_input[1024];
-  unsigned int grp_id = blockIdx.x;
-  unsigned int grp_num = gridDim.x;
-
-  unsigned int lcl_sz = blockDim.x;
-  unsigned int lcl_id = threadIdx.x;
-  const NumericT NUM_PI(3.14159265358979323846);
-
-  for (unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += 
grp_num)
-  {
-    for (unsigned int p = lcl_id; p < size; p += lcl_sz)
-    {
-      unsigned int v = get_reorder_num(p, bit_size);
-      if (!is_row_major)
-        lcl_input[v] = input[batch_id * stride + p];
-      else
-        lcl_input[v] = input[p * stride + batch_id];
-    }
-
-    __syncthreads();
-
-    //performs Cooley-Tukey FFT on local arrayfft
-    for (unsigned int s = 0; s < bit_size; s++)
-    {
-      unsigned int ss = 1 << s;
-      NumericT cs, sn;
-      for (unsigned int tid = lcl_id; tid < size; tid += lcl_sz)
-      {
-        unsigned int group = (tid & (ss - 1));
-        unsigned int pos = ((tid >> s) << (s + 1)) + group;
-
-        Numeric2T in1 = lcl_input[pos];
-        Numeric2T in2 = lcl_input[pos + ss];
-
-        NumericT arg = group * sign * NUM_PI / ss;
-
-        sn = sin(arg);
-        cs = cos(arg);
-        Numeric2T ex;
-        ex.x = cs;
-        ex.y = sn;
-
-        Numeric2T tmp;
-        tmp.x = in2.x * ex.x - in2.y * ex.y;
-        tmp.y = in2.x * ex.y + in2.y * ex.x;
-
-        lcl_input[pos + ss] = in1 - tmp;
-        lcl_input[pos]      = in1 + tmp;
-      }
-      __syncthreads();
-    }
-
-    //copy local array back to global memory
-    for (unsigned int p = lcl_id; p < size; p += lcl_sz)
-    {
-      if (!is_row_major)
-        input[batch_id * stride + p] = lcl_input[p];   //index
-      else
-        input[p * stride + batch_id] = lcl_input[p];
-    }
-
-  }
-}
-
-template<typename Numeric2T, typename NumericT>
-__global__ void fft_radix2(Numeric2T * input,
-                           unsigned int s,
-                           unsigned int bit_size,
-                           unsigned int size,
-                           unsigned int stride,
-                           unsigned int batch_num,
-                           NumericT sign,
-                           bool is_row_major)
-{
-
-  unsigned int ss = 1 << s;
-  unsigned int half_size = size >> 1;
-
-  NumericT cs, sn;
-  const NumericT NUM_PI(3.14159265358979323846);
-
-  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
-  unsigned int glb_sz = gridDim.x * blockDim.x;
-
-  for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
-  {
-    for (unsigned int tid = glb_id; tid < half_size; tid += glb_sz)
-    {
-      unsigned int group = (tid & (ss - 1));
-      unsigned int pos = ((tid >> s) << (s + 1)) + group;
-      Numeric2T in1;
-      Numeric2T in2;
-      unsigned int offset;
-      if (!is_row_major)
-      {
-        offset = batch_id * stride + pos;
-        in1 = input[offset];   //index
-        in2 = input[offset + ss];//index
-      }
-      else
-      {
-        offset = pos * stride + batch_id;
-        in1 = input[offset];   //index
-        in2 = input[offset + ss * stride];//index
-      }
-
-      NumericT arg = group * sign * NUM_PI / ss;
-
-      sn = sin(arg);
-      cs = cos(arg);
-
-      Numeric2T ex;
-      ex.x = cs;
-      ex.y = sn;
-
-      Numeric2T tmp;
-      tmp.x = in2.x * ex.x - in2.y * ex.y;
-      tmp.y = in2.x * ex.y + in2.y * ex.x;
-
-      if (!is_row_major)
-        input[offset + ss] = in1 - tmp;  //index
-      else
-        input[offset + ss * stride] = in1 - tmp;  //index
-      input[offset] = in1 + tmp;  //index
-    }
-  }
-}
-
-/**
- * @brief Radix-2 1D algorithm for computing Fourier transformation.
- *
- * Works only on power-of-two sizes of data.
- * Serial implementation has o(n * lg n) complexity.
- * This is a Cooley-Tukey algorithm
- */
-template<typename NumericT, unsigned int AlignmentV>
-void radix2(viennacl::vector<NumericT, AlignmentV> & in,
-            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT 
sign = NumericT(-1),
-            
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER 
data_order = 
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-  typedef typename 
viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
-
-  unsigned int bit_size = viennacl::linalg::cuda::detail::fft::num_bits(size);
-
-  if (size <= viennacl::linalg::cuda::detail::fft::MAX_LOCAL_POINTS_NUM)
-  {
-    fft_radix2_local<<<128,128>>>(reinterpret_cast<numeric2_type 
*>(viennacl::cuda_arg(in)),
-                                  static_cast<unsigned int>(bit_size),
-                                  static_cast<unsigned int>(size),
-                                  static_cast<unsigned int>(stride),
-                                  static_cast<unsigned int>(batch_num),
-                                  static_cast<NumericT>(sign),
-                                  static_cast<bool>(data_order));
-    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2_local");
-  }
-  else
-  {
-    fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type 
*>(viennacl::cuda_arg(in)),
-                             static_cast<unsigned int>(bit_size),
-                             static_cast<unsigned int>(size),
-                             static_cast<unsigned int>(stride),
-                             static_cast<unsigned int>(batch_num),
-                             static_cast<bool>(data_order));
-    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
-
-    for (vcl_size_t step = 0; step < bit_size; step++)
-    {
-      fft_radix2<<<128,128>>>(reinterpret_cast<numeric2_type 
*>(viennacl::cuda_arg(in)),
-                              static_cast<unsigned int>(step),
-                              static_cast<unsigned int>(bit_size),
-                              static_cast<unsigned int>(size),
-                              static_cast<unsigned int>(stride),
-                              static_cast<unsigned int>(batch_num),
-                              sign,
-                              static_cast<bool>(data_order));
-      VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2");
-    }
-  }
-}
-
-/**
- * @brief Radix-2 2D algorithm for computing Fourier transformation.
- *
- * Works only on power-of-two sizes of data.
- * Serial implementation has o(n * lg n) complexity.
- * This is a Cooley-Tukey algorithm
- */
-template<typename NumericT, unsigned int AlignmentV>
-void radix2(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& in,
-            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT 
sign = NumericT(-1),
-            
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER 
data_order = 
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-  typedef typename 
viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
-
-  unsigned int bit_size = viennacl::linalg::cuda::detail::fft::num_bits(size);
-
-  if (size <= viennacl::linalg::cuda::detail::fft::MAX_LOCAL_POINTS_NUM)
-  {
-    fft_radix2_local<<<128,128>>>(reinterpret_cast<numeric2_type 
*>(viennacl::cuda_arg(in)),
-                                  static_cast<unsigned int>(bit_size),
-                                  static_cast<unsigned int>(size),
-                                  static_cast<unsigned int>(stride),
-                                  static_cast<unsigned int>(batch_num),
-                                  sign,
-                                  static_cast<bool>(data_order));
-    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2_local");
-  }
-  else
-  {
-    fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type 
*>(viennacl::cuda_arg(in)),
-                             static_cast<unsigned int>(bit_size),
-                             static_cast<unsigned int>(size),
-                             static_cast<unsigned int>(stride),
-                             static_cast<unsigned int>(batch_num),
-                             static_cast<bool>(data_order));
-    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
-    for (vcl_size_t step = 0; step < bit_size; step++)
-    {
-      fft_radix2<<<128,128>>>(reinterpret_cast<numeric2_type 
*>(viennacl::cuda_arg(in)),
-                              static_cast<unsigned int>(step),
-                              static_cast<unsigned int>(bit_size),
-                              static_cast<unsigned int>(size),
-                              static_cast<unsigned int>(stride),
-                              static_cast<unsigned int>(batch_num),
-                              sign,
-                              static_cast<bool>(data_order));
-      VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2");
-    }
-  }
-}
-
-template<typename Numeric2T, typename NumericT>
-__global__ void bluestein_post(Numeric2T * Z, Numeric2T * out, unsigned int 
size, NumericT sign)
-{
-  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
-  unsigned int glb_sz =gridDim.x * blockDim.x;
-
-  unsigned int double_size = size << 1;
-  NumericT sn_a, cs_a;
-  const NumericT NUM_PI(3.14159265358979323846);
-
-  for (unsigned int i = glb_id; i < size; i += glb_sz)
-  {
-    unsigned int rm = i * i % (double_size);
-    NumericT angle = (NumericT)rm / size * (-NUM_PI);
-
-    sn_a = sin(angle);
-    cs_a= cos(angle);
-
-    Numeric2T b_i;
-    b_i.x = cs_a;
-    b_i.y = sn_a;
-    out[i].x = Z[i].x * b_i.x - Z[i].y * b_i.y;
-    out[i].y = Z[i].x * b_i.y + Z[i].y * b_i.x;
-  }
-}
-
-template<typename Numeric2T, typename NumericT>
-__global__ void bluestein_pre(Numeric2T * input, Numeric2T * A, Numeric2T * B,
-                              unsigned int size, unsigned int ext_size, 
NumericT sign)
-{
-  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
-  unsigned int glb_sz = gridDim.x * blockDim.x;
-
-  unsigned int double_size = size << 1;
-
-  NumericT sn_a, cs_a;
-  const NumericT NUM_PI(3.14159265358979323846);
-
-  for (unsigned int i = glb_id; i < size; i += glb_sz)
-  {
-    unsigned int rm = i * i % (double_size);
-    NumericT angle = (NumericT)rm / size * NUM_PI;
-
-    sn_a = sin(-angle);
-    cs_a= cos(-angle);
-
-    Numeric2T a_i;
-    a_i.x =cs_a;
-    a_i.y =sn_a;
-
-    Numeric2T b_i;
-    b_i.x =cs_a;
-    b_i.y =-sn_a;
-
-    A[i].x = input[i].x * a_i.x - input[i].y * a_i.y;
-    A[i].y = input[i].x * a_i.y + input[i].y * a_i.x;
-    B[i] = b_i;
-
-    // very bad instruction, to be fixed
-    if (i)
-    B[ext_size - i] = b_i;
-  }
-}
-
-template<typename NumericT>
-__global__ void zero2(NumericT * input1, NumericT * input2, unsigned int size)
-{
-  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += 
gridDim.x * blockDim.x)
-  {
-    input1[i].x = 0;
-    input1[i].y = 0;
-
-    input2[i].x = 0;
-    input2[i].y = 0;
-  }
-}
-
-/**
- * @brief Bluestein's algorithm for computing Fourier transformation.
- *
- * Currently,  Works only for sizes of input data which less than 2^16.
- * Uses a lot of additional memory, but should be fast for any size of data.
- * Serial implementation has something about o(n * lg n) complexity
- */
-template<typename NumericT, unsigned int AlignmentV>
-void bluestein(viennacl::vector<NumericT, AlignmentV> & in,
-               viennacl::vector<NumericT, AlignmentV> & out, vcl_size_t 
/*batch_num*/)
-{
-  typedef typename 
viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
-
-  vcl_size_t size = in.size() >> 1;
-  vcl_size_t ext_size = viennacl::linalg::cuda::detail::fft::next_power_2(2 * 
size - 1);
-
-  viennacl::vector<NumericT, AlignmentV> A(ext_size << 1);
-  viennacl::vector<NumericT, AlignmentV> B(ext_size << 1);
-  viennacl::vector<NumericT, AlignmentV> Z(ext_size << 1);
-
-  zero2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(A)),
-                     reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(B)),
-                     static_cast<unsigned int>(ext_size));
-  VIENNACL_CUDA_LAST_ERROR_CHECK("zero2");
-
-  bluestein_pre<<<128,128>>>(reinterpret_cast<numeric2_type 
*>(viennacl::cuda_arg(in)),
-                             reinterpret_cast<numeric2_type 
*>(viennacl::cuda_arg(A)),
-                             reinterpret_cast<numeric2_type 
*>(viennacl::cuda_arg(B)),
-                             static_cast<unsigned int>(size),
-                             static_cast<unsigned int>(ext_size),
-                             NumericT(1));
-  VIENNACL_CUDA_LAST_ERROR_CHECK("bluestein_pre");
-
-  viennacl::linalg::convolve_i(A, B, Z);
-
-  bluestein_post<<<128,128>>>(reinterpret_cast<numeric2_type 
*>(viennacl::cuda_arg(Z)),
-                              reinterpret_cast<numeric2_type 
*>(viennacl::cuda_arg(out)),
-                              static_cast<unsigned int>(size),
-                              NumericT(1));
-  VIENNACL_CUDA_LAST_ERROR_CHECK("bluestein_post");
-}
-
-template<typename NumericT>
-__global__ void fft_mult_vec(const NumericT * input1,
-                             const NumericT * input2,
-                             NumericT * output,
-                             unsigned int size)
-{
-  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += 
gridDim.x * blockDim.x)
-  {
-    NumericT in1 = input1[i];
-    NumericT in2 = input2[i];
-    output[i] = in1 * in2;
-  }
-}
-
-/**
- * @brief Mutiply two complex vectors and store result in output
- */
-template<typename NumericT, unsigned int AlignmentV>
-void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1,
-                      viennacl::vector<NumericT, AlignmentV> const & input2,
-                      viennacl::vector<NumericT, AlignmentV> & output)
-{
-  typedef typename 
viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
-
-  vcl_size_t size = input1.size() / 2;
-
-  fft_mult_vec<<<128,128>>>(reinterpret_cast<const numeric2_type 
*>(viennacl::cuda_arg(input1)),
-                            reinterpret_cast<const numeric2_type 
*>(viennacl::cuda_arg(input2)),
-                            reinterpret_cast<      numeric2_type 
*>(viennacl::cuda_arg(output)),
-                            static_cast<unsigned int>(size));
-  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_mult_vec");
-}
-
-template<typename Numeric2T, typename NumericT>
-__global__ void fft_div_vec_scalar(Numeric2T * input1, unsigned int size, 
NumericT factor)
-{
-  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += 
gridDim.x*blockDim.x)
-    input1[i] = input1[i]/factor;
-}
-
-/**
- * @brief Normalize vector on with his own size
- */
-template<typename NumericT, unsigned int AlignmentV>
-void normalize(viennacl::vector<NumericT, AlignmentV> & input)
-{
-  typedef typename 
viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
-
-  vcl_size_t size = input.size() >> 1;
-  NumericT norm_factor = static_cast<NumericT>(size);
-  fft_div_vec_scalar<<<128,128>>>(reinterpret_cast<numeric2_type 
*>(viennacl::cuda_arg(input)),
-                                  static_cast<unsigned int>(size),
-                                  norm_factor);
-  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_div_vec_scalar");
-}
-
-template<typename NumericT>
-__global__ void transpose(const NumericT * input,
-                          NumericT * output,
-                          unsigned int row_num,
-                          unsigned int col_num)
-{
-  unsigned int size = row_num * col_num;
-  for (unsigned int i =blockIdx.x * blockDim.x + threadIdx.x; i < size; i+= 
gridDim.x * blockDim.x)
-  {
-    unsigned int row = i / col_num;
-    unsigned int col = i - row*col_num;
-    unsigned int new_pos = col * row_num + row;
-    output[new_pos] = input[i];
-  }
-}
-
-/**
- * @brief Transpose matrix
- */
-template<typename NumericT, unsigned int AlignmentV>
-void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> 
const & input,
-               viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & 
output)
-{
-  typedef typename 
viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
-
-  transpose<<<128,128>>>(reinterpret_cast<const numeric2_type 
*>(viennacl::cuda_arg(input)),
-                         reinterpret_cast<      numeric2_type 
*>(viennacl::cuda_arg(output)),
-                         static_cast<unsigned int>(input.internal_size1()>>1),
-                         static_cast<unsigned int>(input.internal_size2()>>1));
-  VIENNACL_CUDA_LAST_ERROR_CHECK("transpose");
-
-}
-
-template<typename NumericT>
-__global__ void transpose_inplace(
-    NumericT * input,
-    unsigned int row_num,
-    unsigned int col_num)
-{
-  unsigned int size = row_num * col_num;
-  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i+= 
gridDim.x * blockDim.x)
-  {
-    unsigned int row = i / col_num;
-    unsigned int col = i - row*col_num;
-    unsigned int new_pos = col * row_num + row;
-    if (i < new_pos)
-    {
-      NumericT val = input[i];
-      input[i] = input[new_pos];
-      input[new_pos] = val;
-    }
-  }
-}
-
-/**
- * @brief Inplace_transpose matrix
- */
-template<typename NumericT, unsigned int AlignmentV>
-void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & 
input)
-{
-  typedef typename 
viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
-
-  transpose_inplace<<<128,128>>>(reinterpret_cast<numeric2_type 
*>(viennacl::cuda_arg(input)),
-                                 static_cast<unsigned 
int>(input.internal_size1()>>1),
-                                 static_cast<unsigned 
int>(input.internal_size2() >> 1));
-  VIENNACL_CUDA_LAST_ERROR_CHECK("transpose_inplace");
-
-}
-
-template<typename RealT,typename ComplexT>
-__global__ void real_to_complex(const RealT * in, ComplexT * out, unsigned int 
size)
-{
-  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += 
gridDim.x * blockDim.x)
-  {
-    ComplexT val;
-    val.x = in[i];
-    val.y = 0;
-    out[i] = val;
-  }
-}
-
-/**
- * @brief Create complex vector from real vector (even elements(2*k) = real 
part, odd elements(2*k+1) = imaginary part)
- */
-template<typename NumericT>
-void real_to_complex(viennacl::vector_base<NumericT> const & in,
-                     viennacl::vector_base<NumericT> & out, vcl_size_t size)
-{
-  typedef typename 
viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
-
-  real_to_complex<<<128,128>>>(viennacl::cuda_arg(in),
-                               reinterpret_cast<numeric2_type 
*>(viennacl::cuda_arg(out)),
-                               static_cast<unsigned int>(size));
-  VIENNACL_CUDA_LAST_ERROR_CHECK("real_to_complex");
-}
-
-template<typename ComplexT,typename RealT>
-__global__ void complex_to_real(const ComplexT * in, RealT * out, unsigned int 
size)
-{
-  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += 
gridDim.x * blockDim.x)
-    out[i] = in[i].x;
-}
-
-/**
- * @brief Create real vector from complex vector (even elements(2*k) = real 
part, odd elements(2*k+1) = imaginary part)
- */
-template<typename NumericT>
-void complex_to_real(viennacl::vector_base<NumericT> const & in,
-                     viennacl::vector_base<NumericT>& out, vcl_size_t size)
-{
-  typedef typename 
viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;
-
-  complex_to_real<<<128,128>>>(reinterpret_cast<const numeric2_type 
*>(viennacl::cuda_arg(in)),
-                               viennacl::cuda_arg(out),
-                               static_cast<unsigned int>(size));
-  VIENNACL_CUDA_LAST_ERROR_CHECK("complex_to_real");
-
-}
-
-template<typename NumericT>
-__global__ void reverse_inplace(NumericT * vec, unsigned int size)
-{
-  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < (size >> 
1); i+=gridDim.x * blockDim.x)
-  {
-    NumericT val1 = vec[i];
-    NumericT val2 = vec[size - i - 1];
-    vec[i] = val2;
-    vec[size - i - 1] = val1;
-  }
-}
-
-/**
- * @brief Reverse vector to oposite order and save it in input vector
- */
-template<typename NumericT>
-void reverse(viennacl::vector_base<NumericT>& in)
-{
-  vcl_size_t size = in.size();
-  reverse_inplace<<<128,128>>>(viennacl::cuda_arg(in), static_cast<unsigned 
int>(size));
-  VIENNACL_CUDA_LAST_ERROR_CHECK("reverse_inplace");
-}
-
-}  //namespace cuda
-}  //namespace linalg
-}  //namespace viennacl
-
-#endif /* FFT_OPERATIONS_HPP_ */

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp
deleted file mode 100644
index 302a73c..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp
+++ /dev/null
@@ -1,666 +0,0 @@
-#ifndef VIENNACL_LINALG_CUDA_ILU_OPERATIONS_HPP_
-#define VIENNACL_LINALG_CUDA_ILU_OPERATIONS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file viennacl/linalg/cuda/ilu_operations.hpp
-    @brief Implementations of specialized routines for the Chow-Patel parallel 
ILU preconditioner using CUDA
-*/
-
-#include <cmath>
-#include <algorithm>  //for std::max and std::min
-
-#include "viennacl/forwards.h"
-#include "viennacl/scalar.hpp"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/meta/predicate.hpp"
-#include "viennacl/meta/enable_if.hpp"
-#include "viennacl/traits/size.hpp"
-#include "viennacl/traits/start.hpp"
-#include "viennacl/linalg/vector_operations.hpp"
-#include "viennacl/traits/stride.hpp"
-
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace cuda
-{
-
/** @brief Counts, per row of A, the entries in the lower triangular part (column <= row, diagonal included). Results are per-row counts; the caller converts them to offsets via an exclusive scan. */
template<typename IndexT> // to control external linkage
__global__ void extract_L_kernel_1(
          const IndexT * A_row_indices,
          const IndexT * A_col_indices,
          unsigned int A_size1,
          unsigned int * L_row_indices)
{
  unsigned int const stride = gridDim.x * blockDim.x;
  for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; row < A_size1; row += stride)
  {
    unsigned int const row_stop = A_row_indices[row + 1];

    unsigned int count = 0;
    for (unsigned int j = A_row_indices[row]; j < row_stop; ++j)
      if (A_col_indices[j] <= row)
        ++count;

    L_row_indices[row] = count;
  }
}
-
/** @brief Copies the lower-triangular entries (column <= row) of CSR matrix A into L. L_row_indices must already hold the row offsets produced by extract_L_kernel_1 + exclusive scan. */
template<typename NumericT>
__global__ void extract_L_kernel_2(
          unsigned int const *A_row_indices,
          unsigned int const *A_col_indices,
          NumericT     const *A_elements,
          unsigned int A_size1,

          unsigned int const *L_row_indices,
          unsigned int       *L_col_indices,
          NumericT           *L_elements)
{
  unsigned int const stride = gridDim.x * blockDim.x;
  for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; row < A_size1; row += stride)
  {
    unsigned int write_pos      = L_row_indices[row];  // next free slot in L for this row
    unsigned int const row_stop = A_row_indices[row + 1];

    for (unsigned int j = A_row_indices[row]; j < row_stop; ++j)
    {
      unsigned int const col = A_col_indices[j];
      if (col <= row)  // lower triangle, diagonal included
      {
        L_col_indices[write_pos] = col;
        L_elements[write_pos]    = A_elements[j];
        ++write_pos;
      }
    }
  }
}
-
/** @brief Extracts the lower triangular part L (diagonal included) of a CSR matrix A.
 *
 * Three phases: count entries per row, exclusive-scan the counts into row offsets,
 * then write the column indices and values.
 */
template<typename NumericT>
void extract_L(compressed_matrix<NumericT> const & A,
               compressed_matrix<NumericT>       & L)
{
  // Phase 1: per-row entry counts for L, written into L's row buffer:
  extract_L_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                   viennacl::cuda_arg<unsigned int>(A.handle2()),
                                   static_cast<unsigned int>(A.size1()),
                                   viennacl::cuda_arg<unsigned int>(L.handle1())
                                  );
  VIENNACL_CUDA_LAST_ERROR_CHECK("extract_L_kernel_1");

  // Phase 2: turn counts into CSR row offsets via exclusive scan, then size L:
  viennacl::vector<unsigned int> wrapped_L_row_buffer(viennacl::cuda_arg<unsigned int>(L.handle1().cuda_handle()), viennacl::CUDA_MEMORY, A.size1() + 1);
  viennacl::linalg::exclusive_scan(wrapped_L_row_buffer, wrapped_L_row_buffer);
  L.reserve(wrapped_L_row_buffer[L.size1()], false);

  // Phase 3: populate column indices and values of L:
  extract_L_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                   viennacl::cuda_arg<unsigned int>(A.handle2()),
                                   viennacl::cuda_arg<NumericT>(A.handle()),
                                   static_cast<unsigned int>(A.size1()),
                                   viennacl::cuda_arg<unsigned int>(L.handle1()),
                                   viennacl::cuda_arg<unsigned int>(L.handle2()),
                                   viennacl::cuda_arg<NumericT>(L.handle())
                                  );
  VIENNACL_CUDA_LAST_ERROR_CHECK("extract_L_kernel_2");

  L.generate_row_block_information();

} // extract_L
-
-///////////////////////////////////////////////
-
-
/** @brief Extracts the scaling diagonal D_ii = 1 / sqrt(|A_ii|) from the CSR matrix A. Rows without a stored diagonal leave D_elements[row] untouched. */
template<typename NumericT>
__global__ void ilu_scale_kernel_1(
          unsigned int const *A_row_indices,
          unsigned int const *A_col_indices,
          NumericT     const *A_elements,
          unsigned int A_size1,

          NumericT           *D_elements)
{
  unsigned int const stride = gridDim.x * blockDim.x;
  for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; row < A_size1; row += stride)
  {
    unsigned int const row_stop = A_row_indices[row + 1];
    for (unsigned int j = A_row_indices[row]; j < row_stop; ++j)
    {
      if (A_col_indices[j] == row)  // found the diagonal entry
      {
        D_elements[row] = NumericT(1) / sqrt(fabs(A_elements[j]));
        break;
      }
    }
  }
}
-
/** @brief Scales values in a matrix such that output = D * input * D, where D is a diagonal matrix (only the diagonal is provided) */
template<typename NumericT>
__global__ void ilu_scale_kernel_2(
          unsigned int const *R_row_indices,
          unsigned int const *R_col_indices,
          NumericT           *R_elements,
          unsigned int R_size1,

          NumericT           *D_elements)
{
  unsigned int const stride = gridDim.x * blockDim.x;
  for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; row < R_size1; row += stride)
  {
    NumericT const d_row        = D_elements[row];      // left scaling factor for this row
    unsigned int const row_stop = R_row_indices[row + 1];

    // each entry r_ij becomes d_i * r_ij * d_j:
    for (unsigned int j = R_row_indices[row]; j < row_stop; ++j)
      R_elements[j] *= d_row * D_elements[R_col_indices[j]];
  }
}
-
-
-
/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L accordingly.
 *
 * D is the diagonal with D_ii = 1 / sqrt(|A_ii|) (computed by ilu_scale_kernel_1);
 * L is then rescaled in place as D * L * D (ilu_scale_kernel_2).
 *
 * @param A  Source matrix (read only).
 * @param L  Lower triangular factor extracted from A, rescaled in place.
 */
template<typename NumericT>
void icc_scale(compressed_matrix<NumericT> const & A,
               compressed_matrix<NumericT>       & L)
{
  viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));

  // fill D:
  ilu_scale_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                   viennacl::cuda_arg<unsigned int>(A.handle2()),
                                   viennacl::cuda_arg<NumericT>(A.handle()),
                                   static_cast<unsigned int>(A.size1()),
                                   viennacl::cuda_arg(D)
                                  );
  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_1");

  // scale L:
  ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
                                   viennacl::cuda_arg<unsigned int>(L.handle2()),
                                   viennacl::cuda_arg<NumericT>(L.handle()),
                                   static_cast<unsigned int>(L.size1()),
                                   viennacl::cuda_arg(D)
                                  );
  // Bug fix: this check previously reported "ilu_scale_kernel_1", hiding which launch failed.
  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_2");
}
-
-/////////////////////////////////////
-
/** @brief CUDA kernel for one Chow-Patel-ICC sweep
 *
 * One grid-stride pass over the rows of L. Each nonzero l_ij is recomputed from the
 * frozen previous iterate (L_backup), so the sweep is a Jacobi-style fixed-point step:
 * all reads go to L_backup, all writes to L_elements.
 *
 * Preconditions (not checked): L is CSR with columns sorted ascending per row, and the
 * diagonal entry is the LAST entry of each row. aij_L holds the values of A at the
 * sparsity pattern of L, in the same ordering as L_elements.
 */
template<typename NumericT>
__global__ void icc_chow_patel_sweep_kernel(
          unsigned int const *L_row_indices,
          unsigned int const *L_col_indices,
          NumericT           *L_elements,
          NumericT     const *L_backup,
          unsigned int L_size1,
          NumericT     const *aij_L)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < L_size1;
                    row += gridDim.x * blockDim.x)
  {
    //
    // update L:
    //
    unsigned int row_Li_start = L_row_indices[row];
    unsigned int row_Li_end   = L_row_indices[row + 1];

    for (unsigned int i = row_Li_start; i < row_Li_end; ++i)
    {
      unsigned int col = L_col_indices[i];  // column j of the entry l_ij being updated

      // row 'col' of L plays the role of the j-th column of U = L^T:
      unsigned int row_Lj_start = L_row_indices[col];
      unsigned int row_Lj_end   = L_row_indices[col + 1];

      // compute \sum_{k=1}^{j-1} l_ik u_kj
      // merge-join of row 'row' and row 'col' over their (sorted) column indices:
      unsigned int index_Lj = row_Lj_start;
      unsigned int col_Lj = L_col_indices[index_Lj];
      NumericT s = aij_L[i];                // start from a_ij, subtract the partial inner product
      for (unsigned int index_Li = row_Li_start; index_Li < i; ++index_Li)
      {
        unsigned int col_Li = L_col_indices[index_Li];

        // find element in U
        // (advance the second cursor; sorted columns guarantee no backtracking)
        while (col_Lj < col_Li)
        {
          ++index_Lj;
          col_Lj = L_col_indices[index_Lj];
        }

        if (col_Lj == col_Li)
          s -= L_backup[index_Li] * L_backup[index_Lj];
      }

      // update l_ij:
      // NOTE(review): assumes s >= 0 on the diagonal (sqrt) and a nonzero trailing
      // diagonal in row 'col' — TODO confirm this holds for indefinite inputs.
      L_elements[i] = (row == col) ? sqrt(s) : (s / L_backup[row_Lj_end - 1]);  // diagonal element is last entry in U
    }

  }
}
-
-
/** @brief Performs one nonlinear relaxation step of the Chow-Patel-ICC (cf. Algorithm 2 in paper)
 *
 * @param L      Current lower triangular iterate; updated in place.
 * @param aij_L  Values of A restricted to the sparsity pattern of L.
 */
template<typename NumericT>
void icc_chow_patel_sweep(compressed_matrix<NumericT>       & L,
                          vector<NumericT>            const & aij_L)
{
  // Snapshot the current values of L so the kernel updates from a frozen iterate
  // (Jacobi-style sweep: reads from the copy, writes to L itself).
  viennacl::backend::mem_handle L_backup;
  viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L));
  viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size());

  icc_chow_patel_sweep_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
                                            viennacl::cuda_arg<unsigned int>(L.handle2()),
                                            viennacl::cuda_arg<NumericT>(L.handle()),
                                            viennacl::cuda_arg<NumericT>(L_backup),
                                            static_cast<unsigned int>(L.size1()),
                                            viennacl::cuda_arg<NumericT>(aij_L.handle())
                                           );
  VIENNACL_CUDA_LAST_ERROR_CHECK("icc_chow_patel_sweep_kernel");
}
-
-
-////////////////////////////// ILU ///////////////////////////
-
/** @brief Counts, per row of A, the entries falling in the lower part (col <= row) and upper part (col >= row); the diagonal is counted in both. Counts are later scanned into row offsets. */
template<typename IndexT> // to control external linkage
__global__ void extract_LU_kernel_1(
          const IndexT * A_row_indices,
          const IndexT * A_col_indices,
          unsigned int A_size1,

          unsigned int * L_row_indices,

          unsigned int * U_row_indices)
{
  unsigned int const stride = gridDim.x * blockDim.x;
  for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; row < A_size1; row += stride)
  {
    unsigned int const row_stop = A_row_indices[row + 1];

    unsigned int count_L = 0;
    unsigned int count_U = 0;
    for (unsigned int j = A_row_indices[row]; j < row_stop; ++j)
    {
      unsigned int const col = A_col_indices[j];
      if (col <= row)
        ++count_L;
      if (col >= row)
        ++count_U;
    }

    L_row_indices[row] = count_L;
    U_row_indices[row] = count_U;
  }
}
-
/** @brief Distributes the entries of CSR matrix A into L (col <= row) and U (col >= row); the diagonal goes into both. Row offsets of L and U must already be valid. */
template<typename NumericT>
__global__ void extract_LU_kernel_2(
          unsigned int const *A_row_indices,
          unsigned int const *A_col_indices,
          NumericT     const *A_elements,
          unsigned int A_size1,

          unsigned int const *L_row_indices,
          unsigned int       *L_col_indices,
          NumericT           *L_elements,

          unsigned int const *U_row_indices,
          unsigned int       *U_col_indices,
          NumericT           *U_elements)
{
  unsigned int const stride = gridDim.x * blockDim.x;
  for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; row < A_size1; row += stride)
  {
    unsigned int write_L        = L_row_indices[row];  // next free slot in L for this row
    unsigned int write_U        = U_row_indices[row];  // next free slot in U for this row
    unsigned int const row_stop = A_row_indices[row + 1];

    for (unsigned int j = A_row_indices[row]; j < row_stop; ++j)
    {
      unsigned int const col   = A_col_indices[j];
      NumericT     const value = A_elements[j];

      if (col <= row)
      {
        L_col_indices[write_L] = col;
        L_elements[write_L]    = value;
        ++write_L;
      }

      if (col >= row)
      {
        U_col_indices[write_U] = col;
        U_elements[write_U]    = value;
        ++write_U;
      }
    }
  }
}
-
/** @brief Splits a CSR matrix A into its lower factor L (col <= row) and upper factor U (col >= row); the diagonal is stored in both.
 *
 * Three phases: count entries per row for L and U, exclusive-scan the counts into
 * row offsets, then write column indices and values.
 */
template<typename NumericT>
void extract_LU(compressed_matrix<NumericT> const & A,
                compressed_matrix<NumericT>       & L,
                compressed_matrix<NumericT>       & U)
{
  // Phase 1: per-row entry counts for L and U:
  extract_LU_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                    viennacl::cuda_arg<unsigned int>(A.handle2()),
                                    static_cast<unsigned int>(A.size1()),
                                    viennacl::cuda_arg<unsigned int>(L.handle1()),
                                    viennacl::cuda_arg<unsigned int>(U.handle1())
                                   );
  VIENNACL_CUDA_LAST_ERROR_CHECK("extract_LU_kernel_1");

  // Phase 2: exclusive scans turn the counts into CSR row offsets; then size L and U:
  viennacl::vector<unsigned int> wrapped_L_row_buffer(viennacl::cuda_arg<unsigned int>(L.handle1()), viennacl::CUDA_MEMORY, A.size1() + 1);
  viennacl::linalg::exclusive_scan(wrapped_L_row_buffer, wrapped_L_row_buffer);
  L.reserve(wrapped_L_row_buffer[L.size1()], false);

  viennacl::vector<unsigned int> wrapped_U_row_buffer(viennacl::cuda_arg<unsigned int>(U.handle1()), viennacl::CUDA_MEMORY, A.size1() + 1);
  viennacl::linalg::exclusive_scan(wrapped_U_row_buffer, wrapped_U_row_buffer);
  U.reserve(wrapped_U_row_buffer[U.size1()], false);

  // Phase 3: populate column indices and values of L and U:
  extract_LU_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                    viennacl::cuda_arg<unsigned int>(A.handle2()),
                                    viennacl::cuda_arg<NumericT>(A.handle()),
                                    static_cast<unsigned int>(A.size1()),
                                    viennacl::cuda_arg<unsigned int>(L.handle1()),
                                    viennacl::cuda_arg<unsigned int>(L.handle2()),
                                    viennacl::cuda_arg<NumericT>(L.handle()),
                                    viennacl::cuda_arg<unsigned int>(U.handle1()),
                                    viennacl::cuda_arg<unsigned int>(U.handle2()),
                                    viennacl::cuda_arg<NumericT>(U.handle())
                                   );
  VIENNACL_CUDA_LAST_ERROR_CHECK("extract_LU_kernel_2");

  L.generate_row_block_information();
  // Note: block information for U will be generated after transposition

} // extract_LU
-
-///////////////////////////////////////////////
-
/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly.
 *
 * D is the diagonal with D_ii = 1 / sqrt(|A_ii|); L and U are rescaled in place.
 */
template<typename NumericT>
void ilu_scale(compressed_matrix<NumericT> const & A,
               compressed_matrix<NumericT>       & L,
               compressed_matrix<NumericT>       & U)
{
  viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));

  // extract the scaling diagonal from A:
  ilu_scale_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                   viennacl::cuda_arg<unsigned int>(A.handle2()),
                                   viennacl::cuda_arg<NumericT>(A.handle()),
                                   static_cast<unsigned int>(A.size1()),
                                   viennacl::cuda_arg<NumericT>(D.handle())
                                  );
  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_1");

  // rescale L in place (L <- D * L * D):
  ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
                                   viennacl::cuda_arg<unsigned int>(L.handle2()),
                                   viennacl::cuda_arg<NumericT>(L.handle()),
                                   static_cast<unsigned int>(L.size1()),
                                   viennacl::cuda_arg<NumericT>(D.handle())
                                  );
  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_2");

  // rescale U in place (U <- D * U * D):
  ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(U.handle1()),
                                   viennacl::cuda_arg<unsigned int>(U.handle2()),
                                   viennacl::cuda_arg<NumericT>(U.handle()),
                                   static_cast<unsigned int>(U.size1()),
                                   viennacl::cuda_arg<NumericT>(D.handle())
                                  );
  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_2");
}
-
-/////////////////////////////////////
-
/** @brief CUDA kernel for one Chow-Patel-ILU sweep
 *
 * One grid-stride pass over the rows. All reads of factor values go to the frozen
 * copies (L_backup, U_trans_backup) while writes go to L_elements / U_trans_elements,
 * making this a Jacobi-style fixed-point step.
 *
 * Preconditions (not checked): both matrices are CSR with sorted column indices per
 * row; U is stored transposed (row i of U_trans is column i of U), and the diagonal
 * of U is the last entry of each U_trans row. aij_L / aij_U_trans hold the values of
 * A on the respective sparsity patterns, ordered like the element arrays.
 */
template<typename NumericT>
__global__ void ilu_chow_patel_sweep_kernel(
          unsigned int const *L_row_indices,
          unsigned int const *L_col_indices,
          NumericT           *L_elements,
          NumericT     const *L_backup,
          unsigned int L_size1,

          NumericT     const *aij_L,

          unsigned int const *U_trans_row_indices,
          unsigned int const *U_trans_col_indices,
          NumericT           *U_trans_elements,
          NumericT     const *U_trans_backup,

          NumericT     const *aij_U_trans)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < L_size1;
                    row += gridDim.x * blockDim.x)
  {
    //
    // update L:
    //
    unsigned int row_L_start = L_row_indices[row];
    unsigned int row_L_end   = L_row_indices[row + 1];

    for (unsigned int j = row_L_start; j < row_L_end; ++j)
    {
      unsigned int col = L_col_indices[j];

      if (col == row)
        continue;  // unit diagonal of L is implicit; never updated

      unsigned int row_U_start = U_trans_row_indices[col];
      unsigned int row_U_end   = U_trans_row_indices[col + 1];

      // compute \sum_{k=1}^{j-1} l_ik u_kj
      // merge-join over sorted column indices of L-row 'row' and U_trans-row 'col':
      unsigned int index_U = row_U_start;
      unsigned int col_U = (index_U < row_U_end) ? U_trans_col_indices[index_U] : L_size1;
      NumericT sum = 0;
      for (unsigned int k = row_L_start; k < j; ++k)
      {
        unsigned int col_L = L_col_indices[k];

        // find element in U
        while (col_U < col_L)
        {
          ++index_U;
          col_U = U_trans_col_indices[index_U];
        }

        if (col_U == col_L)
          sum += L_backup[k] * U_trans_backup[index_U];
      }

      // update l_ij:
      // NOTE(review): assumes the trailing diagonal of U is nonzero — TODO confirm.
      L_elements[j] = (aij_L[j] - sum) / U_trans_backup[row_U_end - 1];  // diagonal element is last entry in U
    }


    //
    // update U:
    //
    unsigned int row_U_start = U_trans_row_indices[row];
    unsigned int row_U_end   = U_trans_row_indices[row + 1];
    for (unsigned int j = row_U_start; j < row_U_end; ++j)
    {
      unsigned int col = U_trans_col_indices[j];

      row_L_start = L_row_indices[col];
      row_L_end   = L_row_indices[col + 1];

      // compute \sum_{k=1}^{j-1} l_ik u_kj
      // merge-join over sorted column indices of L-row 'col' and U_trans-row 'row':
      unsigned int index_L = row_L_start;
      unsigned int col_L = (index_L < row_L_end) ? L_col_indices[index_L] : L_size1;
      NumericT sum = 0;
      for (unsigned int k = row_U_start; k < j; ++k)
      {
        unsigned int col_U = U_trans_col_indices[k];

        // find element in L
        while (col_L < col_U)
        {
          ++index_L;
          col_L = L_col_indices[index_L];
        }

        if (col_U == col_L)
          sum += L_backup[index_L] * U_trans_backup[k];
      }

      // update u_ij:
      U_trans_elements[j] = aij_U_trans[j] - sum;
    }
  }
}
-
-
/** @brief Performs one nonlinear relaxation step of the Chow-Patel-ILU (cf. Algorithm 2 in paper)
 *
 * @param L            Lower factor iterate; updated in place.
 * @param aij_L        Values of A on the sparsity pattern of L.
 * @param U_trans      Transposed upper factor iterate; updated in place.
 * @param aij_U_trans  Values of A on the sparsity pattern of U_trans.
 */
template<typename NumericT>
void ilu_chow_patel_sweep(compressed_matrix<NumericT>       & L,
                          vector<NumericT>            const & aij_L,
                          compressed_matrix<NumericT>       & U_trans,
                          vector<NumericT>            const & aij_U_trans)
{
  // Snapshot both factors so the kernel reads a frozen iterate while writing the new one.
  viennacl::backend::mem_handle L_backup;
  viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L));
  viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size());

  viennacl::backend::mem_handle U_backup;
  viennacl::backend::memory_create(U_backup, U_trans.handle().raw_size(), viennacl::traits::context(U_trans));
  viennacl::backend::memory_copy(U_trans.handle(), U_backup, 0, 0, U_trans.handle().raw_size());

  ilu_chow_patel_sweep_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
                                            viennacl::cuda_arg<unsigned int>(L.handle2()),
                                            viennacl::cuda_arg<NumericT>(L.handle()),
                                            viennacl::cuda_arg<NumericT>(L_backup),
                                            static_cast<unsigned int>(L.size1()),
                                            viennacl::cuda_arg<NumericT>(aij_L.handle()),
                                            viennacl::cuda_arg<unsigned int>(U_trans.handle1()),
                                            viennacl::cuda_arg<unsigned int>(U_trans.handle2()),
                                            viennacl::cuda_arg<NumericT>(U_trans.handle()),
                                            viennacl::cuda_arg<NumericT>(U_backup),
                                            viennacl::cuda_arg<NumericT>(aij_U_trans.handle())
                                           );
  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_chow_patel_sweep_kernel");
}
-
-//////////////////////////////////////
-
/** @brief Transforms R in place into the Neumann-series matrix -D^{-1}R (with zeroed diagonal) and stores the extracted diagonal in D_elements. */
template<typename NumericT>
__global__ void ilu_form_neumann_matrix_kernel(
          unsigned int const *R_row_indices,
          unsigned int const *R_col_indices,
          NumericT           *R_elements,
          unsigned int R_size1,

          NumericT           *D_elements)
{
  unsigned int const stride = gridDim.x * blockDim.x;
  for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; row < R_size1; row += stride)
  {
    unsigned int const row_begin = R_row_indices[row];
    unsigned int const row_end   = R_row_indices[row + 1];

    // part 1: pull out the diagonal entry of this row and clear it in R (I - D^{-1}R)
    NumericT diag = 0;
    for (unsigned int j = row_begin; j < row_end; ++j)
    {
      if (R_col_indices[j] == row)
      {
        diag = R_elements[j];
        R_elements[j] = 0;
        break;
      }
    }
    D_elements[row] = diag;

    // part 2: scale the remaining row by -1/diag
    // NOTE(review): a missing or zero diagonal leaves diag == 0 and produces inf/nan here —
    // confirm every row carries a nonzero stored diagonal.
    for (unsigned int j = row_begin; j < row_end; ++j)
      R_elements[j] /= -diag;
  }
}
-
-
-
/** @brief Rewrites R in place as the Neumann-series matrix -D^{-1}R (diagonal zeroed) and returns the extracted diagonal in diag_R.
 *
 * @param R       Sparse matrix, transformed in place.
 * @param diag_R  Receives the diagonal entries of the original R.
 */
template<typename NumericT>
void ilu_form_neumann_matrix(compressed_matrix<NumericT> & R,
                             vector<NumericT> & diag_R)
{
  ilu_form_neumann_matrix_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(R.handle1()),
                                               viennacl::cuda_arg<unsigned int>(R.handle2()),
                                               viennacl::cuda_arg<NumericT>(R.handle()),
                                               static_cast<unsigned int>(R.size1()),
                                               viennacl::cuda_arg<NumericT>(diag_R.handle())
                                              );
  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_form_neumann_matrix_kernel");
}
-
-} //namespace cuda
-} //namespace linalg
-} //namespace viennacl
-
-
-#endif

Reply via email to