http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/amg/amg_base.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/amg/amg_base.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/amg/amg_base.hpp
new file mode 100644
index 0000000..8361308
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/amg/amg_base.hpp
@@ -0,0 +1,208 @@
+#ifndef VIENNACL_LINALG_DETAIL_AMG_AMG_BASE_HPP_
+#define VIENNACL_LINALG_DETAIL_AMG_AMG_BASE_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+/** @file amg_base.hpp
+    @brief Helper classes and functions for the AMG preconditioner. 
Experimental.
+
+    AMG code contributed by Markus Wagner
+*/
+
+#include <cmath>
+#include <set>
+#include <list>
+#include <stdexcept>
+#include <algorithm>
+
+#include <map>
+#ifdef VIENNACL_WITH_OPENMP
+#include <omp.h>
+#endif
+
+#include "viennacl/context.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief Enumeration of coarsening methods for algebraic multigrid. */
+enum amg_coarsening_method
+{
+  AMG_COARSENING_METHOD_ONEPASS = 1,
+  AMG_COARSENING_METHOD_AGGREGATION,
+  AMG_COARSENING_METHOD_MIS2_AGGREGATION
+};
+
+/** @brief Enumeration of interpolation methods for algebraic multigrid. */
+enum amg_interpolation_method
+{
+  AMG_INTERPOLATION_METHOD_DIRECT = 1,
+  AMG_INTERPOLATION_METHOD_AGGREGATION,
+  AMG_INTERPOLATION_METHOD_SMOOTHED_AGGREGATION
+};
+
+
+/** @brief A tag for algebraic multigrid (AMG). Used to transport information 
from the user to the implementation.
+*/
+class amg_tag
+{
+public:
+  /** @brief The constructor, setting default values for the various 
parameters.
+    *
+    * Default coarsening routine: Aggreggation based on maximum independent 
sets of distance (MIS-2)
+    * Default interpolation routine: Smoothed aggregation
+    * Default threshold for strong connections: 0.1 (customizations are 
recommeded!)
+    * Default weight for Jacobi smoother: 1.0
+    * Default number of pre-smooth operations: 2
+    * Default number of post-smooth operations: 2
+    * Default number of coarse levels: 0 (this indicates that as many coarse 
levels as needed are constructed until the cutoff is reached)
+    * Default coarse grid size for direct solver (coarsening cutoff): 50
+    */
+  amg_tag()
+  : coarsening_method_(AMG_COARSENING_METHOD_MIS2_AGGREGATION), 
interpolation_method_(AMG_INTERPOLATION_METHOD_AGGREGATION),
+    strong_connection_threshold_(0.1), jacobi_weight_(1.0),
+    presmooth_steps_(2), postsmooth_steps_(2),
+    coarse_levels_(0), coarse_cutoff_(50) {}
+
+  // Getter-/Setter-Functions
+  /** @brief Sets the strategy used for constructing coarse grids  */
+  void set_coarsening_method(amg_coarsening_method s) { coarsening_method_ = 
s; }
+  /** @brief Returns the current coarsening strategy */
+  amg_coarsening_method get_coarsening_method() const { return 
coarsening_method_; }
+
+  /** @brief Sets the interpolation method to the provided method */
+  void set_interpolation_method(amg_interpolation_method interpol) { 
interpolation_method_ = interpol; }
+  /** @brief Returns the current interpolation method */
+  amg_interpolation_method get_interpolation_method() const { return 
interpolation_method_; }
+
+  /** @brief Sets the strong connection threshold. Customizations by the user 
essential for best results!
+    *
+    * With classical interpolation, a connection is considered strong if 
|a_ij| >= threshold * max_k(|a_ik|)
+    * Strength of connection currently ignored for aggregation-based 
coarsening (to be added in the future).
+    */
+  void set_strong_connection_threshold(double threshold) { if (threshold > 0) 
strong_connection_threshold_ = threshold; }
+  /** @brief Returns the strong connection threshold parameter.
+    *
+    * @see set_strong_connection_threshold() for an explanation of the 
threshold parameter
+    */
+  double get_strong_connection_threshold() const { return 
strong_connection_threshold_; }
+
+  /** @brief Sets the weight (damping) for the Jacobi smoother.
+    *
+    * The optimal value depends on the problem at hand. Values of 0.67 or 1.0 
are usually a good starting point for further experiments.
+    */
+  void set_jacobi_weight(double w) { if (w > 0) jacobi_weight_ = w; }
+  /** @brief Returns the Jacobi smoother weight (damping). */
+  double get_jacobi_weight() const { return jacobi_weight_; }
+
+  /** @brief Sets the number of smoother applications on the fine level before 
restriction to the coarser level. */
+  void set_presmooth_steps(vcl_size_t steps) { presmooth_steps_ = steps; }
+  /** @brief Returns the number of smoother applications on the fine level 
before restriction to the coarser level. */
+  vcl_size_t get_presmooth_steps() const { return presmooth_steps_; }
+
+  /** @brief Sets the number of smoother applications on the coarse level 
before interpolation to the finer level. */
+  void set_postsmooth_steps(vcl_size_t steps) { postsmooth_steps_ = steps; }
+  /** @brief Returns the number of smoother applications on the coarse level 
before interpolation to the finer level. */
+  vcl_size_t get_postsmooth_steps() const { return postsmooth_steps_; }
+
+  /** @brief Sets the number of coarse levels. If set to zero, then coarse 
levels are constructed until the cutoff size is reached. */
+  void set_coarse_levels(vcl_size_t levels)  { coarse_levels_ = levels; }
+  /** @brief Returns the number of coarse levels. If zero, then coarse levels 
are constructed until the cutoff size is reached. */
+  vcl_size_t get_coarse_levels() const { return coarse_levels_; }
+
+  /** @brief Sets the coarse grid size for which the recursive multigrid 
scheme is stopped and a direct solver is used. */
+  void set_coarsening_cutoff(vcl_size_t size)  { coarse_cutoff_ = size; }
+  /** @brief Returns the coarse grid size for which the recursive multigrid 
scheme is stopped and a direct solver is used. */
+  vcl_size_t get_coarsening_cutoff() const { return coarse_cutoff_; }
+
+  /** @brief Sets the ViennaCL context for the setup stage. Set this to a host 
context if you want to run the setup on the host.
+    *
+    * Set the ViennaCL context for the solver application via 
set_target_context().
+    * Target and setup context can be different.
+    */
+  void set_setup_context(viennacl::context ctx)  { setup_ctx_ = ctx; }
+  /** @brief Returns the ViennaCL context for the preconditioenr setup. */
+  viennacl::context const & get_setup_context() const { return setup_ctx_; }
+
+  /** @brief Sets the ViennaCL context for the solver cycle stage (i.e. 
preconditioner applications).
+    *
+    * Since the cycle stage easily benefits from accelerators, you usually 
want to set this to a CUDA or OpenCL-enabled context.
+    */
+  void set_target_context(viennacl::context ctx)  { target_ctx_ = ctx; }
+  /** @brief Returns the ViennaCL context for the solver cycle stage (i.e. 
preconditioner applications). */
+  viennacl::context const & get_target_context() const { return target_ctx_; }
+
+private:
+  amg_coarsening_method coarsening_method_;
+  amg_interpolation_method interpolation_method_;
+  double strong_connection_threshold_, jacobi_weight_;
+  vcl_size_t presmooth_steps_, postsmooth_steps_, coarse_levels_, 
coarse_cutoff_;
+  viennacl::context setup_ctx_, target_ctx_;
+};
+
+
+namespace detail
+{
+namespace amg
+{
+
+
+  struct amg_level_context
+  {
+    void resize(vcl_size_t num_points, vcl_size_t max_nnz)
+    {
+      influence_jumper_.resize(num_points + 1, false);
+      influence_ids_.resize(max_nnz, false);
+      influence_values_.resize(num_points, false);
+      point_types_.resize(num_points, false);
+      coarse_id_.resize(num_points, false);
+    }
+
+    void switch_context(viennacl::context ctx)
+    {
+      influence_jumper_.switch_memory_context(ctx);
+      influence_ids_.switch_memory_context(ctx);
+      influence_values_.switch_memory_context(ctx);
+      point_types_.switch_memory_context(ctx);
+      coarse_id_.switch_memory_context(ctx);
+    }
+
+    enum
+    {
+      POINT_TYPE_UNDECIDED = 0,
+      POINT_TYPE_COARSE,
+      POINT_TYPE_FINE
+    } amg_point_types;
+
+    viennacl::vector<unsigned int> influence_jumper_; // similar to row_buffer 
for CSR matrices
+    viennacl::vector<unsigned int> influence_ids_;    // IDs of influencing 
points
+    viennacl::vector<unsigned int> influence_values_; // Influence measure for 
each point
+    viennacl::vector<unsigned int> point_types_;      // 0: undecided, 1: 
coarse point, 2: fine point. Using char here because type for enum might be a 
larger type
+    viennacl::vector<unsigned int> coarse_id_;        // coarse ID used on the 
next level. Only valid for coarse points. Fine points may (ab)use their entry 
for something else.
+    unsigned int num_coarse_;
+  };
+
+
+} //namespace amg
+}
+}
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_kernel_calls.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_kernel_calls.hpp
 
b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_kernel_calls.hpp
new file mode 100644
index 0000000..8308f77
--- /dev/null
+++ 
b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_kernel_calls.hpp
@@ -0,0 +1,191 @@
+#ifndef VIENNACL_LINALG_DETAIL_BISECT_KERNEL_CALLS_HPP_
+#define VIENNACL_LINALG_DETAIL_BISECT_KERNEL_CALLS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+/** @file viennacl/linalg/detail/bisect/bisect_kernel_calls.hpp
+    @brief Kernel calls for the bisection algorithm
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for 
which
+    the creation of derivative works is allowed by including the following 
statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+
+#include "viennacl/linalg/detail/bisect/structs.hpp"
+#ifdef VIENNACL_WITH_OPENCL
+   #include "viennacl/linalg/opencl/bisect_kernel_calls.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  #include "viennacl/linalg/cuda/bisect_kernel_calls.hpp"
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+ template<typename NumericT>
+ void bisectSmall(const InputData<NumericT> &input, ResultDataSmall<NumericT> 
&result,
+                  const unsigned int mat_size,
+                  const NumericT lg, const NumericT ug,
+                  const NumericT precision)
+  {
+    switch (viennacl::traits::handle(input.g_a).get_active_handle_id())
+    {
+#ifdef VIENNACL_WITH_OPENCL
+      case viennacl::OPENCL_MEMORY:
+        viennacl::linalg::opencl::bisectSmall(input, result,
+                                             mat_size,
+                                             lg,ug,
+                                             precision);
+        break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+      case viennacl::CUDA_MEMORY:
+        viennacl::linalg::cuda::bisectSmall(input, result,
+                                             mat_size,
+                                             lg,ug,
+                                             precision);
+        break;
+#endif
+      case viennacl::MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("not implemented");
+    }
+  }
+
+
+
+
+ template<typename NumericT>
+ void bisectLarge(const InputData<NumericT> &input, ResultDataLarge<NumericT> 
&result,
+                    const unsigned int mat_size,
+                    const NumericT lg, const NumericT ug,
+                    const NumericT precision)
+  {
+    switch (viennacl::traits::handle(input.g_a).get_active_handle_id())
+    {
+#ifdef VIENNACL_WITH_OPENCL
+      case viennacl::OPENCL_MEMORY:
+        viennacl::linalg::opencl::bisectLarge(input, result,
+                                             mat_size,
+                                             lg,ug,
+                                             precision);
+        break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+      case viennacl::CUDA_MEMORY:
+        viennacl::linalg::cuda::bisectLarge(input, result,
+                                             mat_size,
+                                             lg,ug,
+                                             precision);
+        break;
+#endif
+      case viennacl::MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("not implemented");
+    }
+  }
+
+
+
+
+
+ template<typename NumericT>
+ void bisectLarge_OneIntervals(const InputData<NumericT> &input, 
ResultDataLarge<NumericT> &result,
+                    const unsigned int mat_size,
+                    const NumericT precision)
+  {
+    switch (viennacl::traits::handle(input.g_a).get_active_handle_id())
+    {
+#ifdef VIENNACL_WITH_OPENCL
+      case viennacl::OPENCL_MEMORY:
+        viennacl::linalg::opencl::bisectLargeOneIntervals(input, result,
+                                             mat_size,
+                                             precision);
+        break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+      case viennacl::CUDA_MEMORY:
+        viennacl::linalg::cuda::bisectLarge_OneIntervals(input, result,
+                                             mat_size,
+                                             precision);
+
+        break;
+#endif
+      case viennacl::MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("not implemented");
+    }
+  }
+
+
+
+
+ template<typename NumericT>
+ void bisectLarge_MultIntervals(const InputData<NumericT> &input, 
ResultDataLarge<NumericT> &result,
+                    const unsigned int mat_size,
+                    const NumericT precision)
+  {
+    switch (viennacl::traits::handle(input.g_a).get_active_handle_id())
+    {
+#ifdef VIENNACL_WITH_OPENCL
+      case viennacl::OPENCL_MEMORY:
+      viennacl::linalg::opencl::bisectLargeMultIntervals(input, result,
+                                           mat_size,
+                                           precision);
+        break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+      case viennacl::CUDA_MEMORY:
+        viennacl::linalg::cuda::bisectLarge_MultIntervals(input, result,
+                                             mat_size,
+                                             precision);
+        break;
+#endif
+      case viennacl::MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("not implemented");
+    }
+  }
+} // namespace detail
+} // namespace linalg
+} //namespace viennacl
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_large.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_large.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_large.hpp
new file mode 100755
index 0000000..337858f
--- /dev/null
+++ 
b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_large.hpp
@@ -0,0 +1,142 @@
+#ifndef VIENNACL_LINALG_DETAIL_BISECT_BISECT_LARGE_HPP_
+#define VIENNACL_LINALG_DETAIL_BISECT_BISECT_LARGE_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+
+/** @file viennacl/linalg/detail//bisect/bisect_large.hpp
+    @brief Computation of eigenvalues of a large symmetric, tridiagonal matrix
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for 
which
+    the creation of derivative works is allowed by including the following 
statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+
+// includes, system
+#include <iostream>
+#include <iomanip>  
+#include <stdlib.h>
+#include <stdio.h>
+
+// includes, project
+#include "viennacl/linalg/detail/bisect/config.hpp"
+#include "viennacl/linalg/detail/bisect/structs.hpp"
+#include "viennacl/linalg/detail/bisect/bisect_kernel_calls.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+
+////////////////////////////////////////////////////////////////////////////////
+//! Run the kernels to compute the eigenvalues for large matrices
+//! @param  input   handles to input data
+//! @param  result  handles to result data
+//! @param  mat_size  matrix size
+//! @param  precision  desired precision of eigenvalues
+//! @param  lg  lower limit of Gerschgorin interval
+//! @param  ug  upper limit of Gerschgorin interval
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+void
+computeEigenvaluesLargeMatrix(InputData<NumericT> &input, 
ResultDataLarge<NumericT> &result,
+                              const unsigned int mat_size,
+                              const NumericT lg, const NumericT ug,  const 
NumericT precision)
+{
+
+
+  // First kernel call: decide on which intervals bisect_Large_OneIntervals/
+  // bisect_Large_MultIntervals is executed
+  viennacl::linalg::detail::bisectLarge(input, result, mat_size, lg, ug, 
precision);
+
+  // compute eigenvalues for intervals that contained only one eigenvalue
+  // after the first processing step
+  viennacl::linalg::detail::bisectLarge_OneIntervals(input, result, mat_size, 
precision);
+
+  // process intervals that contained more than one eigenvalue after
+  // the first processing step
+  viennacl::linalg::detail::bisectLarge_MultIntervals(input, result, mat_size, 
precision);
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Process the result, that is obtain result from device and do simple sanity
+//! checking
+//! @param  result  handles to result data
+//! @param  mat_size  matrix size
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+bool
+processResultDataLargeMatrix(ResultDataLarge<NumericT> &result,
+                             const unsigned int mat_size)
+{
+    bool bCompareResult = true;
+    // copy data from intervals that contained more than one eigenvalue after
+    // the first processing step
+    std::vector<NumericT> lambda_mult(mat_size);
+    viennacl::copy(result.g_lambda_mult, lambda_mult);
+
+    std::vector<unsigned int> pos_mult(mat_size);
+    viennacl::copy(result.g_pos_mult, pos_mult);
+
+    std::vector<unsigned int> blocks_mult_sum(mat_size);
+    viennacl::copy(result.g_blocks_mult_sum, blocks_mult_sum);
+
+    unsigned int num_one_intervals = result.g_num_one;
+    unsigned int sum_blocks_mult = mat_size - num_one_intervals;
+
+
+    // copy data for intervals that contained one eigenvalue after the first
+    // processing step
+    std::vector<NumericT> left_one(mat_size);
+    std::vector<NumericT> right_one(mat_size);
+    std::vector<unsigned int> pos_one(mat_size);
+
+    viennacl::copy(result.g_left_one, left_one);
+    viennacl::copy(result.g_right_one, right_one);
+    viennacl::copy(result.g_pos_one, pos_one);
+
+
+    // singleton intervals generated in the second step
+    for (unsigned int i = 0; i < sum_blocks_mult; ++i)
+    {
+      if (pos_mult[i] != 0)
+        result.std_eigenvalues[pos_mult[i] - 1] = lambda_mult[i];
+
+      else
+      {
+        throw memory_exception("Invalid array index! Are there more than 256 
equal eigenvalues?");
+      }
+    }
+
+    // singleton intervals generated in the first step
+    unsigned int index = 0;
+
+    for (unsigned int i = 0; i < num_one_intervals; ++i, ++index)
+    {
+        result.std_eigenvalues[pos_one[i] - 1] = left_one[i];
+    }
+    return bCompareResult;
+}
+} // namespace detail
+}  // namespace linalg
+}  // namespace viennacl
+#endif //VIENNACL_LINALG_DETAIL_BISECT_LARGE_HPP_

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_small.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_small.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_small.hpp
new file mode 100755
index 0000000..144640b
--- /dev/null
+++ 
b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_small.hpp
@@ -0,0 +1,96 @@
+#ifndef VIENNACL_LINALG_DETAIL_BISECT_SMALL_HPP_
+#define VIENNACL_LINALG_DETAIL_BISECT_SMALL_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+
+/** @file viennacl/linalg/detail//bisect/bisect_small.hpp
+    @brief Computation of eigenvalues of a small symmetric, tridiagonal matrix
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for 
which
+    the creation of derivative works is allowed by including the following 
statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+// includes, system
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <float.h>
+
+// includes, project
+
+#include "viennacl/linalg/detail/bisect/structs.hpp"
+
+// includes, kernels
+#include "viennacl/linalg/detail/bisect/bisect_kernel_calls.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+////////////////////////////////////////////////////////////////////////////////
+//! Determine eigenvalues for matrices smaller than MAX_SMALL_MATRIX
+//! @param  input  handles to input data of kernel
+//! @param  result handles to result of kernel
+//! @param  mat_size  matrix size
+//! @param  lg  lower limit of Gerschgorin interval
+//! @param  ug  upper limit of Gerschgorin interval
+//! @param  precision  desired precision of eigenvalues
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+void
+computeEigenvaluesSmallMatrix(const InputData<NumericT> &input, 
ResultDataSmall<NumericT> &result,
+                              const unsigned int mat_size,
+                              const NumericT lg, const NumericT ug,
+                              const NumericT precision)
+{
+  viennacl::linalg::detail::bisectSmall( input, result, mat_size, lg, ug, 
precision);
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+//! Process the result obtained on the device, that is transfer to host and
+//! perform basic sanity checking
+//! @param  result  handles to result data
+//! @param  mat_size   matrix size
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+void
+processResultSmallMatrix(ResultDataSmall<NumericT> &result,
+                         const unsigned int mat_size)
+{
+  // copy data back to host
+  std::vector<NumericT> left(mat_size);
+  std::vector<unsigned int> left_count(mat_size);
+
+  viennacl::copy(result.vcl_g_left, left);
+  viennacl::copy(result.vcl_g_left_count, left_count);
+
+  for (unsigned int i = 0; i < mat_size; ++i)
+  {
+      result.std_eigenvalues[left_count[i]] = left[i];
+  }
+}
+}  // namespace detail
+}  // namespace linalg
+} // namespace viennacl
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/config.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/config.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/config.hpp
new file mode 100755
index 0000000..3afa509
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/config.hpp
@@ -0,0 +1,44 @@
+#ifndef VIENNACL_LINALG_DETAIL_BISECT_CONFIG_HPP_
+#define VIENNACL_LINALG_DETAIL_BISECT_CONFIG_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+
+
+/** @file viennacl/linalg/detail//bisect/config.hpp
+ *     @brief Global configuration parameters
+ *
+ *         Implementation based on the sample provided with the CUDA 6.0 SDK, 
for which
+ *             the creation of derivative works is allowed by including the 
following statement:
+ *                 "This software contains source code provided by NVIDIA 
Corporation."
+ *                 */
+
+// should be power of two
+#define  VIENNACL_BISECT_MAX_THREADS_BLOCK                256
+
+#ifdef VIENNACL_WITH_OPENCL
+#  define VIENNACL_BISECT_MAX_SMALL_MATRIX                 256
+#  define VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX   256
+#else                                                          // if CUDA is 
used
+#  define VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX   512 // change to 
256 if errors occur
+#  define VIENNACL_BISECT_MAX_SMALL_MATRIX                 512 // change to 
256 if errors occur
+#endif
+
+ #define  VIENNACL_BISECT_MIN_ABS_INTERVAL                 5.0e-37
+
+#endif 

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/gerschgorin.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/gerschgorin.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/gerschgorin.hpp
new file mode 100755
index 0000000..53cd863
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/gerschgorin.hpp
@@ -0,0 +1,94 @@
+#ifndef _VIENNACL_LINALG_DETAIL_BISECT_GERSCHORIN_HPP_
+#define _VIENNACL_LINALG_DETAIL_BISECT_GERSCHORIN_HPP_
+
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+
+/** @file viennacl/linalg/detail//bisect/gerschgorin.hpp
+    @brief  Computation of Gerschgorin interval for symmetric, tridiagonal 
matrix
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for 
which
+    the creation of derivative works is allowed by including the following 
statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include <cfloat>
+
+#include "viennacl/linalg/detail/bisect/util.hpp"
+#include "viennacl/vector.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+  
////////////////////////////////////////////////////////////////////////////////
+  //! Compute Gerschgorin interval for symmetric, tridiagonal matrix
+  //! @param  d  diagonal elements
+  //! @param  s  superdiagonal elements
+  //! @param  n  size of matrix
+  //! @param  lg  lower limit of Gerschgorin interval
+  //! @param  ug  upper limit of Gerschgorin interval
+  
////////////////////////////////////////////////////////////////////////////////
+  template<typename NumericT>
+  void
+  computeGerschgorin(std::vector<NumericT> & d, std::vector<NumericT> & s, 
unsigned int n, NumericT &lg, NumericT &ug)
+  {
+      // compute bounds
+      for (unsigned int i = 1; i < (n - 1); ++i)
+      {
+
+          // sum over the absolute values of all elements of row i
+          NumericT sum_abs_ni = fabsf(s[i]) + fabsf(s[i + 1]);
+
+          lg = min(lg, d[i] - sum_abs_ni);
+          ug = max(ug, d[i] + sum_abs_ni);
+      }
+
+      // first and last row, only one superdiagonal element
+
+      // first row
+      lg = min(lg, d[0] - fabsf(s[1]));
+      ug = max(ug, d[0] + fabsf(s[1]));
+
+      // last row
+      lg = min(lg, d[n-1] - fabsf(s[n-1]));
+      ug = max(ug, d[n-1] + fabsf(s[n-1]));
+
+      // increase interval to avoid side effects of fp arithmetic
+      NumericT bnorm = max(fabsf(ug), fabsf(lg));
+
+      // these values depend on the implmentation of floating count that is
+      // employed in the following
+      NumericT psi_0 = 11 * FLT_EPSILON * bnorm;
+      NumericT psi_n = 11 * FLT_EPSILON * bnorm;
+
+      lg = lg - bnorm * 2 * static_cast<NumericT>(n) * FLT_EPSILON - psi_0;
+      ug = ug + bnorm * 2 * static_cast<NumericT>(n) * FLT_EPSILON + psi_n;
+
+      ug = max(lg, ug);
+  }
+}  // namespace detail
+}  // namespace linalg
+} // namespace viennacl
+#endif  // _VIENNACL_LINALG_DETAIL_GERSCHORIN_H_

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/structs.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/structs.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/structs.hpp
new file mode 100755
index 0000000..1943da3
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/structs.hpp
@@ -0,0 +1,182 @@
+#ifndef VIENNACL_LINALG_DETAIL_BISECT_STRUCTS_HPP_
+#define VIENNACL_LINALG_DETAIL_BISECT_STRUCTS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+
+/** @file viennacl/linalg/detail//bisect/structs.hpp
+    @brief  Helper structures to simplify variable handling
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for 
which
+    the creation of derivative works is allowed by including the following 
statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <float.h>
+#include <assert.h>
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+
+/////////////////////////////////////////////////////////////////////////////////
+//! In this class the input matrix is stored
+/////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+struct InputData
+{
+  //! host side representation of diagonal
+  std::vector<NumericT> std_a;
+  //! host side representation superdiagonal
+  std::vector<NumericT> std_b;
+  //! device side representation of diagonal
+  viennacl::vector<NumericT> g_a;
+  //!device side representation of superdiagonal
+  viennacl::vector<NumericT> g_b;
+
+  /** @brief Initialize the input data to the algorithm
+   *
+   * @param diagonal        vector with the diagonal elements
+   * @param superdiagonal   vector with the superdiagonal elements
+   * @param sz              size of the matrix
+   */
+  InputData(std::vector<NumericT> diagonal, std::vector<NumericT> 
superdiagonal, const unsigned int sz) :
+              std_a(sz), std_b(sz), g_a(sz), g_b(sz)
+  {
+   std_a = diagonal;
+   std_b = superdiagonal;
+
+   viennacl::copy(std_b, g_b);
+   viennacl::copy(std_a, g_a);
+  }
+
+  InputData(viennacl::vector<NumericT> diagonal, viennacl::vector<NumericT> 
superdiagonal, const unsigned int sz) :
+              std_a(sz), std_b(sz), g_a(sz), g_b(sz)
+  {
+   g_a = diagonal;
+   g_b = superdiagonal;
+
+   viennacl::copy(g_a, std_a);
+   viennacl::copy(g_b, std_b);
+  }
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////
+//! In this class the data of the result for small matrices is stored
+/////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+struct ResultDataSmall
+{
+  //! eigenvalues (host side)
+  std::vector<NumericT> std_eigenvalues;
+  //! left interval limits at the end of the computation
+  viennacl::vector<NumericT> vcl_g_left;
+  //! right interval limits at the end of the computation
+  viennacl::vector<NumericT> vcl_g_right;
+  //! number of eigenvalues smaller than the left interval limit
+  viennacl::vector<unsigned int> vcl_g_left_count;
+  //! number of eigenvalues bigger than the right interval limit
+  viennacl::vector<unsigned int> vcl_g_right_count;
+
+
+  
////////////////////////////////////////////////////////////////////////////////
+  //! Initialize variables and memory for the result for small matrices
+  
////////////////////////////////////////////////////////////////////////////////
+  ResultDataSmall(const unsigned int mat_size) :
+    std_eigenvalues(mat_size), vcl_g_left(mat_size), vcl_g_right(mat_size), 
vcl_g_left_count(mat_size), vcl_g_right_count(mat_size) {}
+};
+
+
+
+
+
+/////////////////////////////////////////////////////////////////////////////////
+//! In this class the data of the result for large matrices is stored
+/////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+struct ResultDataLarge
+{
+//! eigenvalues
+  std::vector<NumericT> std_eigenvalues;
+
+  //! number of intervals containing one eigenvalue after the first step
+  viennacl::scalar<unsigned int> g_num_one;
+
+  //! number of (thread) blocks of intervals containing multiple eigenvalues 
after the first steo
+  viennacl::scalar<unsigned int> g_num_blocks_mult;
+
+  //! left interval limits of intervals containing one eigenvalue after the 
first iteration step
+  viennacl::vector<NumericT> g_left_one;
+
+  //! right interval limits of intervals containing one eigenvalue after the 
first iteration step
+  viennacl::vector<NumericT> g_right_one;
+
+  //! interval indices (position in sorted listed of eigenvalues) of intervals 
containing one eigenvalue after the first iteration step
+  viennacl::vector<unsigned int> g_pos_one;
+
+  //! left interval limits of intervals containing multiple eigenvalues after 
the first iteration step
+  viennacl::vector<NumericT> g_left_mult;
+  //! right interval limits of intervals containing multiple eigenvalues after 
the first iteration step
+  viennacl::vector<NumericT> g_right_mult;
+
+  //! number of eigenvalues less than the left limit of the eigenvalue 
intervals containing multiple eigenvalues
+  viennacl::vector<unsigned int> g_left_count_mult;
+
+  //! number of eigenvalues less than the right limit of the eigenvalue 
intervals containing multiple eigenvalues
+  viennacl::vector<unsigned int> g_right_count_mult;
+  //! start addresses in g_left_mult etc. of blocks of intervals containing 
more than one eigenvalue after the first step
+  viennacl::vector<unsigned int> g_blocks_mult;
+
+  //! accumulated number of intervals in g_left_mult etc. of blocks of 
intervals containing more than one eigenvalue after the first step
+  viennacl::vector<unsigned int> g_blocks_mult_sum;
+
+  //! eigenvalues that have been generated in the second step from intervals 
that still contained multiple eigenvalues after the first step
+  viennacl::vector<NumericT> g_lambda_mult;
+
+  //! eigenvalue index of intervals that have been generated in the second 
processing step
+  viennacl::vector<unsigned int> g_pos_mult;
+
+  /** @brief Initialize variables and memory for result
+   *
+   * @param  mat_size  size of the matrix
+   */
+  ResultDataLarge(unsigned int mat_size) :
+    std_eigenvalues(mat_size), g_num_one(0), g_num_blocks_mult(0),
+    g_left_one(mat_size), g_right_one(mat_size), g_pos_one(mat_size),
+    g_left_mult(mat_size), g_right_mult(mat_size), 
g_left_count_mult(mat_size), g_right_count_mult(mat_size),
+    g_blocks_mult(mat_size), g_blocks_mult_sum(mat_size), 
g_lambda_mult(mat_size), g_pos_mult(mat_size) {}
+
+};
+} // namespace detail
+} // namespace linalg
+} // namespace viennacl
+#endif // #ifndef VIENNACL_LINALG_DETAIL_STRUCTS_HPP_
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/util.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/util.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/util.hpp
new file mode 100755
index 0000000..883d202
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/util.hpp
@@ -0,0 +1,106 @@
+#ifndef VIENNACL_LINALG_DETAIL_BISECT_UTIL_HPP_
+#define VIENNACL_LINALG_DETAIL_BISECT_UTIL_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+
+/** @file viennacl/linalg/detail//bisect/util.hpp
+    @brief Utility functions
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for 
which
+    the creation of derivative works is allowed by including the following 
statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+
+////////////////////////////////////////////////////////////////////////////////
+//! Minimum
+////////////////////////////////////////////////////////////////////////////////
+template<class T>
+#ifdef __CUDACC__
+__host__  __device__
+#endif
+T
+min(const T &lhs, const T &rhs)
+{
+
+    return (lhs < rhs) ? lhs : rhs;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Maximum
+////////////////////////////////////////////////////////////////////////////////
+template<class T>
+#ifdef __CUDACC__
+__host__  __device__
+#endif
+T
+max(const T &lhs, const T &rhs)
+{
+
+    return (lhs < rhs) ? rhs : lhs;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Sign of number (float)
+////////////////////////////////////////////////////////////////////////////////
+#ifdef __CUDACC__
+__host__  __device__
+#endif
+inline float
+sign_f(const float &val)
+{
+    return (val < 0.0f) ? -1.0f : 1.0f;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Sign of number (double)
+////////////////////////////////////////////////////////////////////////////////
+#ifdef __CUDACC__
+__host__  __device__
+#endif
+inline double
+sign_d(const double &val)
+{
+    return (val < 0.0) ? -1.0 : 1.0;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//! Get the number of blocks that are required to process \a num_threads with
+//! \a num_threads_blocks threads per block
+///////////////////////////////////////////////////////////////////////////////
+extern "C"
+inline
+unsigned int
+getNumBlocksLinear(const unsigned int num_threads,
+                   const unsigned int num_threads_block)
+{
+    const unsigned int block_rem =
+        ((num_threads % num_threads_block) != 0) ? 1 : 0;
+    return (num_threads / num_threads_block) + block_rem;
+}
+} // namespace detail
+} // namespace linalg
+} // namespace viennacl
+#endif // #ifndef VIENNACL_LINALG_DETAIL_UTIL_HPP_

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/block_ilu.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/block_ilu.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/block_ilu.hpp
new file mode 100644
index 0000000..1540e2d
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/block_ilu.hpp
@@ -0,0 +1,617 @@
+#ifndef VIENNACL_LINALG_DETAIL_BLOCK_ILU_HPP_
+#define VIENNACL_LINALG_DETAIL_BLOCK_ILU_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+/** @file viennacl/linalg/detail/ilu/block_ilu.hpp
+    @brief Implementations of incomplete block factorization preconditioners
+*/
+
+#include <vector>
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/detail/ilu/common.hpp"
+#include "viennacl/linalg/detail/ilu/ilu0.hpp"
+#include "viennacl/linalg/detail/ilu/ilut.hpp"
+
+#include <map>
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+  /** @brief Helper range class for representing a subvector of a larger 
buffer. */
+  template<typename VectorT, typename NumericT, typename SizeT = vcl_size_t>
+  class ilu_vector_range
+  {
+  public:
+    ilu_vector_range(VectorT & v,
+                     SizeT start_index,
+                     SizeT vec_size
+                    ) : vec_(v), start_(start_index), size_(vec_size) {}
+
+    NumericT & operator()(SizeT index)
+    {
+      assert(index < size_ && bool("Index out of bounds!"));
+      return vec_[start_ + index];
+    }
+
+    NumericT & operator[](SizeT index)
+    {
+      assert(index < size_ && bool("Index out of bounds!"));
+      return vec_[start_ + index];
+    }
+
+    SizeT size() const { return size_; }
+
+  private:
+    VectorT & vec_;
+    SizeT start_;
+    SizeT size_;
+  };
+
+  /** @brief Extracts a diagonal block from a larger system matrix
+    *
+    * @param A                   The full matrix
+    * @param diagonal_block_A    The output matrix, to which the extracted 
block is written to
+    * @param start_index         First row- and column-index of the block
+    * @param stop_index          First row- and column-index beyond the block
+    */
+  template<typename NumericT>
+  void extract_block_matrix(viennacl::compressed_matrix<NumericT> const & A,
+                            viennacl::compressed_matrix<NumericT>       & 
diagonal_block_A,
+                            vcl_size_t start_index,
+                            vcl_size_t stop_index
+                            )
+  {
+    assert( (A.handle1().get_active_handle_id() == viennacl::MAIN_MEMORY) && 
bool("System matrix must reside in main memory for ILU0") );
+    assert( (A.handle2().get_active_handle_id() == viennacl::MAIN_MEMORY) && 
bool("System matrix must reside in main memory for ILU0") );
+    assert( (A.handle().get_active_handle_id() == viennacl::MAIN_MEMORY) && 
bool("System matrix must reside in main memory for ILU0") );
+
+    NumericT     const * A_elements   = 
viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+    unsigned int const * A_row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(A.handle1());
+    unsigned int const * A_col_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(A.handle2());
+
+    NumericT     * output_elements   = 
viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(diagonal_block_A.handle());
+    unsigned int * output_row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(diagonal_block_A.handle1());
+    unsigned int * output_col_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(diagonal_block_A.handle2());
+
+    vcl_size_t output_counter = 0;
+    for (vcl_size_t row = start_index; row < stop_index; ++row)
+    {
+      unsigned int buffer_col_start = A_row_buffer[row];
+      unsigned int buffer_col_end   = A_row_buffer[row+1];
+
+      output_row_buffer[row - start_index] = static_cast<unsigned 
int>(output_counter);
+
+      for (unsigned int buf_index = buffer_col_start; buf_index < 
buffer_col_end; ++buf_index)
+      {
+        unsigned int col = A_col_buffer[buf_index];
+        if (col < start_index)
+          continue;
+
+        if (col >= static_cast<unsigned int>(stop_index))
+          continue;
+
+        output_col_buffer[output_counter] = static_cast<unsigned int>(col - 
start_index);
+        output_elements[output_counter] = A_elements[buf_index];
+        ++output_counter;
+      }
+      output_row_buffer[row - start_index + 1] = static_cast<unsigned 
int>(output_counter);
+    }
+  }
+
+} // namespace detail
+
+
+
+/** @brief A block ILU preconditioner class, can be supplied to 
solve()-routines
+ *
+ * @tparam MatrixType   Type of the system matrix
+ * @tparam ILUTag       Type of the tag identifiying the ILU preconditioner to 
be used on each block.
+*/
+template<typename MatrixT, typename ILUTag>
+class block_ilu_precond
+{
+typedef typename MatrixT::value_type      ScalarType;
+
+public:
+  typedef std::vector<std::pair<vcl_size_t, vcl_size_t> >    
index_vector_type;   //the pair refers to index range [a, b) of each block
+
+
+  block_ilu_precond(MatrixT const & mat,
+                    ILUTag const & tag,
+                    vcl_size_t num_blocks = 8
+                   ) : tag_(tag), L_blocks(num_blocks), U_blocks(num_blocks)
+  {
+    // Set up vector of block indices:
+    block_indices_.resize(num_blocks);
+    for (vcl_size_t i=0; i<num_blocks; ++i)
+    {
+      vcl_size_t start_index = (   i  * mat.size1()) / num_blocks;
+      vcl_size_t stop_index  = ((i+1) * mat.size1()) / num_blocks;
+
+      block_indices_[i] = std::pair<vcl_size_t, vcl_size_t>(start_index, 
stop_index);
+    }
+
+    //initialize preconditioner:
+    //std::cout << "Start CPU precond" << std::endl;
+    init(mat);
+    //std::cout << "End CPU precond" << std::endl;
+  }
+
+  block_ilu_precond(MatrixT const & mat,
+                    ILUTag const & tag,
+                    index_vector_type const & block_boundaries
+                   ) : tag_(tag), block_indices_(block_boundaries), 
L_blocks(block_boundaries.size()), U_blocks(block_boundaries.size())
+  {
+    //initialize preconditioner:
+    //std::cout << "Start CPU precond" << std::endl;
+    init(mat);
+    //std::cout << "End CPU precond" << std::endl;
+  }
+
+
+  template<typename VectorT>
+  void apply(VectorT & vec) const
+  {
+    for (vcl_size_t i=0; i<block_indices_.size(); ++i)
+      apply_dispatch(vec, i, ILUTag());
+  }
+
+private:
+  void init(MatrixT const & A)
+  {
+    viennacl::context host_context(viennacl::MAIN_MEMORY);
+    viennacl::compressed_matrix<ScalarType> mat(host_context);
+
+    viennacl::copy(A, mat);
+
+    unsigned int const * row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(mat.handle1());
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for
+#endif
+    for (long i2=0; i2<static_cast<long>(block_indices_.size()); ++i2)
+    {
+      vcl_size_t i = static_cast<vcl_size_t>(i2);
+      // Step 1: Extract blocks
+      vcl_size_t block_size = block_indices_[i].second - 
block_indices_[i].first;
+      vcl_size_t block_nnz  = row_buffer[block_indices_[i].second] - 
row_buffer[block_indices_[i].first];
+      viennacl::compressed_matrix<ScalarType> mat_block(block_size, 
block_size, block_nnz, host_context);
+
+      detail::extract_block_matrix(mat, mat_block, block_indices_[i].first, 
block_indices_[i].second);
+
+      // Step 2: Precondition blocks:
+      viennacl::switch_memory_context(L_blocks[i], host_context);
+      viennacl::switch_memory_context(U_blocks[i], host_context);
+      init_dispatch(mat_block, L_blocks[i], U_blocks[i], tag_);
+    }
+
+  }
+
+  void init_dispatch(viennacl::compressed_matrix<ScalarType> const & mat_block,
+                     viennacl::compressed_matrix<ScalarType> & L,
+                     viennacl::compressed_matrix<ScalarType> & U,
+                     viennacl::linalg::ilu0_tag)
+  {
+    (void)U;
+    L = mat_block;
+    viennacl::linalg::precondition(L, tag_);
+  }
+
+  void init_dispatch(viennacl::compressed_matrix<ScalarType> const & mat_block,
+                     viennacl::compressed_matrix<ScalarType> & L,
+                     viennacl::compressed_matrix<ScalarType> & U,
+                     viennacl::linalg::ilut_tag)
+  {
+    L.resize(mat_block.size1(), mat_block.size2());
+    U.resize(mat_block.size1(), mat_block.size2());
+    viennacl::linalg::precondition(mat_block, L, U, tag_);
+  }
+
+  template<typename VectorT>
+  void apply_dispatch(VectorT & vec, vcl_size_t i, viennacl::linalg::ilu0_tag) 
const
+  {
+    detail::ilu_vector_range<VectorT, ScalarType> vec_range(vec, 
block_indices_[i].first, L_blocks[i].size2());
+
+    unsigned int const * row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(L_blocks[i].handle1());
+    unsigned int const * col_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(L_blocks[i].handle2());
+    ScalarType   const * elements   = 
viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(L_blocks[i].handle());
+
+    
viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, 
col_buffer, elements, vec_range, L_blocks[i].size2(), unit_lower_tag());
+    
viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, 
col_buffer, elements, vec_range, L_blocks[i].size2(), upper_tag());
+  }
+
+  template<typename VectorT>
+  void apply_dispatch(VectorT & vec, vcl_size_t i, viennacl::linalg::ilut_tag) 
const
+  {
+    detail::ilu_vector_range<VectorT, ScalarType> vec_range(vec, 
block_indices_[i].first, L_blocks[i].size2());
+
+    {
+      unsigned int const * row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(L_blocks[i].handle1());
+      unsigned int const * col_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(L_blocks[i].handle2());
+      ScalarType   const * elements   = 
viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(L_blocks[i].handle());
+
+      
viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, 
col_buffer, elements, vec_range, L_blocks[i].size2(), unit_lower_tag());
+    }
+
+    {
+      unsigned int const * row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(U_blocks[i].handle1());
+      unsigned int const * col_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(U_blocks[i].handle2());
+      ScalarType   const * elements   = 
viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(U_blocks[i].handle());
+
+      
viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, 
col_buffer, elements, vec_range, U_blocks[i].size2(), upper_tag());
+    }
+  }
+
+  ILUTag tag_;
+  index_vector_type block_indices_;
+  std::vector< viennacl::compressed_matrix<ScalarType> > L_blocks;
+  std::vector< viennacl::compressed_matrix<ScalarType> > U_blocks;
+};
+
+
+
+
+
+/** @brief ILUT preconditioner class, can be supplied to solve()-routines.
+*
+*  Specialization for compressed_matrix
+*/
+template<typename NumericT, unsigned int AlignmentV, typename ILUTagT>
+class block_ilu_precond< compressed_matrix<NumericT, AlignmentV>, ILUTagT>
+{
+  typedef compressed_matrix<NumericT, AlignmentV>        MatrixType;
+
+public:
+  typedef std::vector<std::pair<vcl_size_t, vcl_size_t> >    
index_vector_type;   //the pair refers to index range [a, b) of each block
+
+
+  block_ilu_precond(MatrixType const & mat,
+                    ILUTagT const & tag,
+                    vcl_size_t num_blocks = 8
+                   ) : tag_(tag),
+                       block_indices_(num_blocks),
+                       gpu_block_indices_(),
+                       gpu_L_trans_(0, 0, 
viennacl::context(viennacl::MAIN_MEMORY)),
+                       gpu_U_trans_(0, 0, 
viennacl::context(viennacl::MAIN_MEMORY)),
+                       gpu_D_(mat.size1(), 
viennacl::context(viennacl::MAIN_MEMORY)),
+                       L_blocks_(num_blocks),
+                       U_blocks_(num_blocks)
+  {
+    // Set up vector of block indices:
+    block_indices_.resize(num_blocks);
+    for (vcl_size_t i=0; i<num_blocks; ++i)
+    {
+      vcl_size_t start_index = (   i  * mat.size1()) / num_blocks;
+      vcl_size_t stop_index  = ((i+1) * mat.size1()) / num_blocks;
+
+      block_indices_[i] = std::pair<vcl_size_t, vcl_size_t>(start_index, 
stop_index);
+    }
+
+    //initialize preconditioner:
+    //std::cout << "Start CPU precond" << std::endl;
+    init(mat);
+    //std::cout << "End CPU precond" << std::endl;
+  }
+
+  block_ilu_precond(MatrixType const & mat,
+                    ILUTagT const & tag,
+                    index_vector_type const & block_boundaries
+                   ) : tag_(tag),
+                       block_indices_(block_boundaries),
+                       gpu_block_indices_(),
+                       gpu_L_trans_(0, 0, 
viennacl::context(viennacl::MAIN_MEMORY)),
+                       gpu_U_trans_(0, 0, 
viennacl::context(viennacl::MAIN_MEMORY)),
+                       gpu_D_(mat.size1(), 
viennacl::context(viennacl::MAIN_MEMORY)),
+                       L_blocks_(block_boundaries.size()),
+                       U_blocks_(block_boundaries.size())
+  {
+    //initialize preconditioner:
+    //std::cout << "Start CPU precond" << std::endl;
+    init(mat);
+    //std::cout << "End CPU precond" << std::endl;
+  }
+
+
+  void apply(vector<NumericT> & vec) const
+  {
+    viennacl::linalg::detail::block_inplace_solve(trans(gpu_L_trans_), 
gpu_block_indices_, block_indices_.size(), gpu_D_,
+                                                  vec,
+                                                  
viennacl::linalg::unit_lower_tag());
+
+    viennacl::linalg::detail::block_inplace_solve(trans(gpu_U_trans_), 
gpu_block_indices_, block_indices_.size(), gpu_D_,
+                                                  vec,
+                                                  
viennacl::linalg::upper_tag());
+
+    //apply_cpu(vec);
+  }
+
+
+private:
+
+  void init(MatrixType const & A)
+  {
+    viennacl::context host_context(viennacl::MAIN_MEMORY);
+    viennacl::compressed_matrix<NumericT> mat(host_context);
+
+    mat = A;
+
+    unsigned int const * row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(mat.handle1());
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for
+#endif
+    for (long i=0; i<static_cast<long>(block_indices_.size()); ++i)
+    {
+      // Step 1: Extract blocks
+      vcl_size_t block_size = 
block_indices_[static_cast<vcl_size_t>(i)].second - 
block_indices_[static_cast<vcl_size_t>(i)].first;
+      vcl_size_t block_nnz  = 
row_buffer[block_indices_[static_cast<vcl_size_t>(i)].second] - 
row_buffer[block_indices_[static_cast<vcl_size_t>(i)].first];
+      viennacl::compressed_matrix<NumericT> mat_block(block_size, block_size, 
block_nnz, host_context);
+
+      detail::extract_block_matrix(mat, mat_block, 
block_indices_[static_cast<vcl_size_t>(i)].first, 
block_indices_[static_cast<vcl_size_t>(i)].second);
+
+      // Step 2: Precondition blocks:
+      viennacl::switch_memory_context(L_blocks_[static_cast<vcl_size_t>(i)], 
host_context);
+      viennacl::switch_memory_context(U_blocks_[static_cast<vcl_size_t>(i)], 
host_context);
+      init_dispatch(mat_block, L_blocks_[static_cast<vcl_size_t>(i)], 
U_blocks_[static_cast<vcl_size_t>(i)], tag_);
+    }
+
+    /*
+     * copy resulting preconditioner back to GPU:
+     */
+    viennacl::backend::typesafe_host_array<unsigned int> 
block_indices_uint(gpu_block_indices_, 2 * block_indices_.size());
+    for (vcl_size_t i=0; i<block_indices_.size(); ++i)
+    {
+      block_indices_uint.set(2*i,     block_indices_[i].first);
+      block_indices_uint.set(2*i + 1, block_indices_[i].second);
+    }
+
+    viennacl::backend::memory_create(gpu_block_indices_, 
block_indices_uint.raw_size(), viennacl::traits::context(A), 
block_indices_uint.get());
+
+    blocks_to_device(A);
+
+  }
+
+  // Copy computed preconditioned blocks to OpenCL device
+  void blocks_to_device(MatrixType const & A)
+  {
+    gpu_L_trans_.resize(A.size1(), A.size2());
+    gpu_U_trans_.resize(A.size1(), A.size2());
+    gpu_D_.resize(A.size1());
+
+    unsigned int * L_trans_row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(gpu_L_trans_.handle1());
+    unsigned int * U_trans_row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(gpu_U_trans_.handle1());
+
+    //
+    // Count elements per row
+    //
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for
+#endif
+    for (long block_index2 = 0; block_index2 < 
static_cast<long>(L_blocks_.size()); ++block_index2)
+    {
+      vcl_size_t block_index = vcl_size_t(block_index2);
+
+      unsigned int block_start = static_cast<unsigned 
int>(block_indices_[block_index].first);
+      unsigned int block_stop  = static_cast<unsigned 
int>(block_indices_[block_index].second);
+
+      unsigned int const * L_row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(L_blocks_[block_index].handle1());
+      unsigned int const * L_col_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(L_blocks_[block_index].handle2());
+
+      // zero row array of L:
+      std::fill(L_trans_row_buffer + block_start,
+                L_trans_row_buffer + block_stop,
+                static_cast<unsigned int>(0));
+
+      // count number of elements per row:
+      for (vcl_size_t row = 0; row < L_blocks_[block_index].size1(); ++row)
+      {
+        unsigned int col_start = L_row_buffer[row];
+        unsigned int col_end   = L_row_buffer[row+1];
+
+        for (unsigned int j = col_start; j < col_end; ++j)
+        {
+          unsigned int col = L_col_buffer[j];
+          if (col < static_cast<unsigned int>(row))
+            L_trans_row_buffer[col + block_start] += 1;
+        }
+      }
+
+      ////// same for U
+
+      unsigned int const * U_row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(U_blocks_[block_index].handle1());
+      unsigned int const * U_col_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(U_blocks_[block_index].handle2());
+
+      // zero row array of U:
+      std::fill(U_trans_row_buffer + block_start,
+                U_trans_row_buffer + block_stop,
+                static_cast<unsigned int>(0));
+
+      // count number of elements per row:
+      for (vcl_size_t row = 0; row < U_blocks_[block_index].size1(); ++row)
+      {
+        unsigned int col_start = U_row_buffer[row];
+        unsigned int col_end   = U_row_buffer[row+1];
+
+        for (unsigned int j = col_start; j < col_end; ++j)
+        {
+          unsigned int col = U_col_buffer[j];
+          if (col > row)
+            U_trans_row_buffer[col + block_start] += 1;
+        }
+      }
+    }
+
+
+    //
+    // Exclusive scan on row buffer (feel free to add parallelization here)
+    //
+    unsigned int current_value = 0;
+    for (vcl_size_t i=0; i<gpu_L_trans_.size1(); ++i)
+    {
+      unsigned int tmp = L_trans_row_buffer[i];
+      L_trans_row_buffer[i] = current_value;
+      current_value += tmp;
+    }
+    gpu_L_trans_.reserve(current_value);
+
+    current_value = 0;
+    for (vcl_size_t i=0; i<gpu_U_trans_.size1(); ++i)
+    {
+      unsigned int tmp = U_trans_row_buffer[i];
+      U_trans_row_buffer[i] = current_value;
+      current_value += tmp;
+    }
+    gpu_U_trans_.reserve(current_value);
+
+
+    //
+    // Fill with data
+    //
+    unsigned int       * L_trans_col_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(gpu_L_trans_.handle2());
+    NumericT           * L_trans_elements   = 
viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(gpu_L_trans_.handle());
+
+    unsigned int       * U_trans_col_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(gpu_U_trans_.handle2());
+    NumericT           * U_trans_elements   = 
viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(gpu_U_trans_.handle());
+
+    NumericT           * D_elements         = 
viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(gpu_D_.handle());
+
+    std::vector<unsigned int> offset_L(gpu_L_trans_.size1());
+    std::vector<unsigned int> offset_U(gpu_U_trans_.size1());
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for
+#endif
+    for (long block_index2 = 0; block_index2 < 
static_cast<long>(L_blocks_.size()); ++block_index2)
+    {
+      vcl_size_t   block_index = vcl_size_t(block_index2);
+      unsigned int block_start = static_cast<unsigned 
int>(block_indices_[block_index].first);
+
+      unsigned int const * L_row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(L_blocks_[block_index].handle1());
+      unsigned int const * L_col_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(L_blocks_[block_index].handle2());
+      NumericT     const * L_elements   = 
viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT    
>(L_blocks_[block_index].handle());
+
+
+      // write L_trans:
+      for (vcl_size_t row = 0; row < L_blocks_[block_index].size1(); ++row)
+      {
+        unsigned int col_start = L_row_buffer[row];
+        unsigned int col_end   = L_row_buffer[row+1];
+
+        for (unsigned int j = col_start; j < col_end; ++j)
+        {
+          unsigned int col = L_col_buffer[j];
+          if (col < row)
+          {
+            unsigned int row_trans = col + block_start;
+            unsigned int k = L_trans_row_buffer[row_trans] + 
offset_L[row_trans];
+            offset_L[row_trans] += 1;
+
+            L_trans_col_buffer[k] = static_cast<unsigned int>(row) + 
block_start;
+            L_trans_elements[k]   = L_elements[j];
+          }
+        }
+      }
+
+      unsigned int const * U_row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(U_blocks_[block_index].handle1());
+      unsigned int const * U_col_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(U_blocks_[block_index].handle2());
+      NumericT     const * U_elements   = 
viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT    
>(U_blocks_[block_index].handle());
+
+      // write U_trans and D:
+      for (vcl_size_t row = 0; row < U_blocks_[block_index].size1(); ++row)
+      {
+        unsigned int col_start = U_row_buffer[row];
+        unsigned int col_end   = U_row_buffer[row+1];
+
+        for (unsigned int j = col_start; j < col_end; ++j)
+        {
+          unsigned int row_trans = U_col_buffer[j] + block_start;
+          unsigned int k = U_trans_row_buffer[row_trans] + offset_U[row_trans];
+
+          if (row_trans == row + block_start) // entry for D
+          {
+            D_elements[row_trans] = U_elements[j];
+          }
+          else if (row_trans > row + block_start) //entry for U
+          {
+            offset_U[row_trans] += 1;
+
+            U_trans_col_buffer[k] = static_cast<unsigned int>(row) + 
block_start;
+            U_trans_elements[k]   = U_elements[j];
+          }
+        }
+      }
+
+    }
+
+    //
+    // Send to destination device:
+    //
+    viennacl::switch_memory_context(gpu_L_trans_, 
viennacl::traits::context(A));
+    viennacl::switch_memory_context(gpu_U_trans_, 
viennacl::traits::context(A));
+    viennacl::switch_memory_context(gpu_D_,       
viennacl::traits::context(A));
+  }
+
+  void init_dispatch(viennacl::compressed_matrix<NumericT> const & mat_block,
+                     viennacl::compressed_matrix<NumericT> & L,
+                     viennacl::compressed_matrix<NumericT> & U,
+                     viennacl::linalg::ilu0_tag)
+  {
+    L = mat_block;
+    viennacl::linalg::precondition(L, tag_);
+    U = L; // fairly poor workaround...
+  }
+
+  void init_dispatch(viennacl::compressed_matrix<NumericT> const & mat_block,
+                     viennacl::compressed_matrix<NumericT> & L,
+                     viennacl::compressed_matrix<NumericT> & U,
+                     viennacl::linalg::ilut_tag)
+  {
+    L.resize(mat_block.size1(), mat_block.size2());
+    U.resize(mat_block.size1(), mat_block.size2());
+    viennacl::linalg::precondition(mat_block, L, U, tag_);
+  }
+
+
+  ILUTagT                               tag_;
+  index_vector_type                     block_indices_;
+  viennacl::backend::mem_handle         gpu_block_indices_;
+  viennacl::compressed_matrix<NumericT> gpu_L_trans_;
+  viennacl::compressed_matrix<NumericT> gpu_U_trans_;
+  viennacl::vector<NumericT>            gpu_D_;
+
+  std::vector<MatrixType> L_blocks_;
+  std::vector<MatrixType> U_blocks_;
+};
+
+
+}
+}
+
+
+
+
+#endif
+
+
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/chow_patel_ilu.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/chow_patel_ilu.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/chow_patel_ilu.hpp
new file mode 100644
index 0000000..7628cdb
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/chow_patel_ilu.hpp
@@ -0,0 +1,316 @@
+#ifndef VIENNACL_LINALG_DETAIL_CHOW_PATEL_ILU_HPP_
+#define VIENNACL_LINALG_DETAIL_CHOW_PATEL_ILU_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+/** @file viennacl/linalg/detail/ilu/chow_patel_ilu.hpp
+  @brief Implementations of incomplete factorization preconditioners with 
fine-grained parallelism.
+
+  Based on "Fine-Grained Parallel Incomplete LU Factorization" by Chow and 
Patel, SIAM J. Sci. Comput., vol. 37, no. 2, pp. C169-C193
+*/
+
+#include <vector>
+#include <cmath>
+#include <iostream>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/detail/ilu/common.hpp"
+#include "viennacl/linalg/ilu_operations.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/backend/memory.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for incomplete LU and incomplete Cholesky factorization with 
static pattern (Parallel-ILU0, Parallel ICC0)
+*/
+class chow_patel_tag
+{
+public:
+  /** @brief Constructor allowing to set the number of sweeps and Jacobi 
iterations.
+    *
+    * @param num_sweeps        Number of sweeps in setup phase
+    * @param num_jacobi_iters  Number of Jacobi iterations for each triangular 
'solve' when applying the preconditioner to a vector
+    */
+  chow_patel_tag(vcl_size_t num_sweeps = 3, vcl_size_t num_jacobi_iters = 2) : 
sweeps_(num_sweeps), jacobi_iters_(num_jacobi_iters) {}
+
+  /** @brief Returns the number of sweeps (i.e. number of nonlinear 
iterations) in the solver setup stage */
+  vcl_size_t sweeps() const { return sweeps_; }
+  /** @brief Sets the number of sweeps (i.e. number of nonlinear iterations) 
in the solver setup stage */
+  void       sweeps(vcl_size_t num) { sweeps_ = num; }
+
+  /** @brief Returns the number of Jacobi iterations (i.e. applications of 
x_{k+1} = (I - D^{-1}R)x_k + D^{-1} b) for each of the solves y = U^{-1} x and 
z = L^{-1} y) for each preconditioner application. */
+  vcl_size_t jacobi_iters() const { return jacobi_iters_; }
+  /** @brief Sets the number of Jacobi iterations for each triangular 'solve' 
when applying the preconditioner to a vector. */
+  void       jacobi_iters(vcl_size_t num) { jacobi_iters_ = num; }
+
+private:
+  vcl_size_t sweeps_;
+  vcl_size_t jacobi_iters_;
+};
+
+namespace detail
+{
+  /** @brief Implementation of the parallel ICC0 factorization, Algorithm 3 in 
Chow-Patel paper.
+   *
+   *  Rather than dealing with a column-major upper triangular matrix U, we 
use the lower-triangular matrix L such that A is approximately given by LL^T.
+   *  The advantage is that L is readily available in row-major format.
+   */
+  template<typename NumericT>
+  void precondition(viennacl::compressed_matrix<NumericT> const & A,
+                    viennacl::compressed_matrix<NumericT>       & L,
+                    viennacl::vector<NumericT>                  & diag_L,
+                    viennacl::compressed_matrix<NumericT>       & L_trans,
+                    chow_patel_tag const & tag)
+  {
+    // make sure L and U have correct dimensions:
+    L.resize(A.size1(), A.size2(), false);
+
+    // initialize L and U from values in A:
+    viennacl::linalg::extract_L(A, L);
+
+    // diagonally scale values from A in L:
+    viennacl::linalg::icc_scale(A, L);
+
+    viennacl::vector<NumericT> aij_L(L.nnz(), viennacl::traits::context(A));
+    viennacl::backend::memory_copy(L.handle(), aij_L.handle(), 0, 0, 
sizeof(NumericT) * L.nnz());
+
+    // run sweeps:
+    for (vcl_size_t i=0; i<tag.sweeps(); ++i)
+      viennacl::linalg::icc_chow_patel_sweep(L, aij_L);
+
+    // transpose L to obtain L_trans:
+    viennacl::linalg::ilu_transpose(L, L_trans);
+
+    // form (I - D_L^{-1}L) and (I - D_U^{-1} U), with U := L_trans
+    viennacl::linalg::ilu_form_neumann_matrix(L,       diag_L);
+    viennacl::linalg::ilu_form_neumann_matrix(L_trans, diag_L);
+  }
+
+
+  /** @brief Implementation of the parallel ILU0 factorization, Algorithm 2 in 
Chow-Patel paper. */
+  template<typename NumericT>
+  void precondition(viennacl::compressed_matrix<NumericT> const & A,
+                    viennacl::compressed_matrix<NumericT>       & L,
+                    viennacl::vector<NumericT>                  & diag_L,
+                    viennacl::compressed_matrix<NumericT>       & U,
+                    viennacl::vector<NumericT>                  & diag_U,
+                    chow_patel_tag const & tag)
+  {
+    // make sure L and U have correct dimensions:
+    L.resize(A.size1(), A.size2(), false);
+    U.resize(A.size1(), A.size2(), false);
+
+    // initialize L and U from values in A:
+    viennacl::linalg::extract_LU(A, L, U);
+
+    // diagonally scale values from A in L and U:
+    viennacl::linalg::ilu_scale(A, L, U);
+
+    // transpose storage layout of U from CSR to CSC via transposition
+    viennacl::compressed_matrix<NumericT> U_trans;
+    viennacl::linalg::ilu_transpose(U, U_trans);
+
+    // keep entries of a_ij for the sweeps
+    viennacl::vector<NumericT> aij_L      (L.nnz(),       
viennacl::traits::context(A));
+    viennacl::vector<NumericT> aij_U_trans(U_trans.nnz(), 
viennacl::traits::context(A));
+
+    viennacl::backend::memory_copy(      L.handle(), aij_L.handle(),       0, 
0, sizeof(NumericT) * L.nnz());
+    viennacl::backend::memory_copy(U_trans.handle(), aij_U_trans.handle(), 0, 
0, sizeof(NumericT) * U_trans.nnz());
+
+    // run sweeps:
+    for (vcl_size_t i=0; i<tag.sweeps(); ++i)
+      viennacl::linalg::ilu_chow_patel_sweep(L, aij_L, U_trans, aij_U_trans);
+
+    // transpose U_trans back:
+    viennacl::linalg::ilu_transpose(U_trans, U);
+
+    // form (I - D_L^{-1}L) and (I - D_U^{-1} U)
+    viennacl::linalg::ilu_form_neumann_matrix(L, diag_L);
+    viennacl::linalg::ilu_form_neumann_matrix(U, diag_U);
+  }
+
+}
+
+
+
+
+/** @brief Parallel Chow-Patel ILU preconditioner class, can be supplied to 
solve()-routines
+*/
+template<typename MatrixT>
+class chow_patel_icc_precond
+{
+  // only works with compressed_matrix!
+  typedef typename MatrixT::CHOW_PATEL_ICC_ONLY_WORKS_WITH_COMPRESSED_MATRIX  
error_type;
+};
+
+
+/** @brief Parallel Chow-Patel ILU preconditioner class, can be supplied to 
solve()-routines.
+*
+*  Specialization for compressed_matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+class chow_patel_icc_precond< viennacl::compressed_matrix<NumericT, 
AlignmentV> >
+{
+
+public:
+  chow_patel_icc_precond(viennacl::compressed_matrix<NumericT, AlignmentV> 
const & A, chow_patel_tag const & tag)
+    : tag_(tag),
+      L_(0, 0, 0, viennacl::traits::context(A)),
+      diag_L_(A.size1(), viennacl::traits::context(A)),
+      L_trans_(0, 0, 0, viennacl::traits::context(A)),
+      x_k_(A.size1(), viennacl::traits::context(A)),
+      b_(A.size1(), viennacl::traits::context(A))
+  {
+    viennacl::linalg::detail::precondition(A, L_, diag_L_, L_trans_, tag_);
+  }
+
+  /** @brief Preconditioner application: LL^Tx = b, computed via Ly = b, L^Tx 
= y using Jacobi iterations.
+    *
+    * L contains (I - D_L^{-1}L), L_trans contains (I - D_L^{-1}L^T) where D 
denotes the respective diagonal matrix
+    */
+  template<typename VectorT>
+  void apply(VectorT & vec) const
+  {
+    //
+    // y = L^{-1} b through Jacobi iteration y_{k+1} = (I - D^{-1}L)y_k + 
D^{-1}x
+    //
+    b_ = viennacl::linalg::element_div(vec, diag_L_);
+    x_k_ = b_;
+    for (unsigned int i=0; i<tag_.jacobi_iters(); ++i)
+    {
+      vec = viennacl::linalg::prod(L_, x_k_);
+      x_k_ = vec + b_;
+    }
+
+    //
+    // x = U^{-1} y through Jacobi iteration x_{k+1} = (I - D^{-1}L^T)x_k + 
D^{-1}b
+    //
+    b_ = viennacl::linalg::element_div(x_k_, diag_L_);
+    x_k_ = b_; // x_1 if x_0 \equiv 0
+    for (unsigned int i=0; i<tag_.jacobi_iters(); ++i)
+    {
+      vec = viennacl::linalg::prod(L_trans_, x_k_);
+      x_k_ = vec + b_;
+    }
+
+    // return result:
+    vec = x_k_;
+  }
+
+private:
+  chow_patel_tag                          tag_;
+  viennacl::compressed_matrix<NumericT>   L_;
+  viennacl::vector<NumericT>              diag_L_;
+  viennacl::compressed_matrix<NumericT>   L_trans_;
+
+  mutable viennacl::vector<NumericT>      x_k_;
+  mutable viennacl::vector<NumericT>      b_;
+};
+
+
+
+
+
+
+/** @brief Parallel Chow-Patel ILU preconditioner class, can be supplied to 
solve()-routines
+*/
+template<typename MatrixT>
+class chow_patel_ilu_precond
+{
+  // only works with compressed_matrix!
+  typedef typename MatrixT::CHOW_PATEL_ILU_ONLY_WORKS_WITH_COMPRESSED_MATRIX  
error_type;
+};
+
+
+/** @brief Parallel Chow-Patel ILU preconditioner class, can be supplied to 
solve()-routines.
+*
+*  Specialization for compressed_matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+class chow_patel_ilu_precond< viennacl::compressed_matrix<NumericT, 
AlignmentV> >
+{
+
+public:
+  chow_patel_ilu_precond(viennacl::compressed_matrix<NumericT, AlignmentV> 
const & A, chow_patel_tag const & tag)
+    : tag_(tag),
+      L_(0, 0, 0, viennacl::traits::context(A)),
+      diag_L_(A.size1(), viennacl::traits::context(A)),
+      U_(0, 0, 0, viennacl::traits::context(A)),
+      diag_U_(A.size1(), viennacl::traits::context(A)),
+      x_k_(A.size1(), viennacl::traits::context(A)),
+      b_(A.size1(), viennacl::traits::context(A))
+  {
+    viennacl::linalg::detail::precondition(A, L_, diag_L_, U_, diag_U_, tag_);
+  }
+
+  /** @brief Preconditioner application: LUx = b, computed via Ly = b, Ux = y 
using Jacobi iterations.
+    *
+    * L_ contains (I - D_L^{-1}L), U_ contains (I - D_U^{-1}U) where D denotes 
the respective diagonal matrix
+    */
+  template<typename VectorT>
+  void apply(VectorT & vec) const
+  {
+    //
+    // y = L^{-1} b through Jacobi iteration y_{k+1} = (I - D^{-1}L)y_k + 
D^{-1}x
+    //
+    b_ = viennacl::linalg::element_div(vec, diag_L_);
+    x_k_ = b_;
+    for (unsigned int i=0; i<tag_.jacobi_iters(); ++i)
+    {
+      vec = viennacl::linalg::prod(L_, x_k_);
+      x_k_ = vec + b_;
+    }
+
+    //
+    // x = U^{-1} y through Jacobi iteration x_{k+1} = (I - D^{-1}U)x_k + 
D^{-1}b
+    //
+    b_ = viennacl::linalg::element_div(x_k_, diag_U_);
+    x_k_ = b_; // x_1 if x_0 \equiv 0
+    for (unsigned int i=0; i<tag_.jacobi_iters(); ++i)
+    {
+      vec = viennacl::linalg::prod(U_, x_k_);
+      x_k_ = vec + b_;
+    }
+
+    // return result:
+    vec = x_k_;
+  }
+
+private:
+  chow_patel_tag                          tag_;
+  viennacl::compressed_matrix<NumericT>   L_;
+  viennacl::vector<NumericT>              diag_L_;
+  viennacl::compressed_matrix<NumericT>   U_;
+  viennacl::vector<NumericT>              diag_U_;
+
+  mutable viennacl::vector<NumericT>      x_k_;
+  mutable viennacl::vector<NumericT>      b_;
+};
+
+
+} // namespace linalg
+} // namespace viennacl
+
+
+#endif
+
+
+

Reply via email to