http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp
deleted file mode 100644
index 56e3c14..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp
+++ /dev/null
@@ -1,705 +0,0 @@
-#ifndef VIENNACL_LINALG_HOST_BASED_SPGEMM_VECTOR_HPP_
-#define VIENNACL_LINALG_HOST_BASED_SPGEMM_VECTOR_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file viennacl/linalg/host_based/sparse_matrix_operations.hpp
-    @brief Implementations of operations using sparse matrices on the CPU 
using a single thread or OpenMP.
-*/
-
-#include "viennacl/forwards.h"
-#include "viennacl/linalg/host_based/common.hpp"
-
-
-#ifdef VIENNACL_WITH_AVX2
-#include "immintrin.h"
-#endif
-
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace host_based
-{
-
-
-
-#ifdef VIENNACL_WITH_AVX2
-inline
-unsigned int row_C_scan_symbolic_vector_AVX2(int const *row_indices_B_begin, 
int const *row_indices_B_end,
-                                             int const *B_row_buffer, int 
const *B_col_buffer, int B_size2,
-                                             int *row_C_vector_output)
-{
-  __m256i avx_all_ones    = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
-  __m256i avx_all_bsize2  = _mm256_set_epi32(B_size2, B_size2, B_size2, 
B_size2, B_size2, B_size2, B_size2, B_size2);
-
-  __m256i avx_row_indices_offsets = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
-  __m256i avx_load_mask = _mm256_sub_epi32(avx_row_indices_offsets, 
_mm256_set1_epi32(row_indices_B_end - row_indices_B_begin));
-  __m256i avx_load_mask2 = avx_load_mask;
-
-  __m256i avx_row_indices = _mm256_set1_epi32(0);
-          avx_row_indices = _mm256_mask_i32gather_epi32(avx_row_indices, 
row_indices_B_begin, avx_row_indices_offsets, avx_load_mask, 4);
-            avx_load_mask = avx_load_mask2; // reload mask (destroyed by 
gather)
-  __m256i avx_row_start   = _mm256_mask_i32gather_epi32(avx_all_ones, 
B_row_buffer,   avx_row_indices, avx_load_mask, 4);
-            avx_load_mask = avx_load_mask2; // reload mask (destroyed by 
gather)
-  __m256i avx_row_end     = _mm256_mask_i32gather_epi32(avx_all_ones, 
B_row_buffer+1, avx_row_indices, avx_load_mask, 4);
-
-          avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
-  __m256i avx_index_front = avx_all_bsize2;
-  avx_index_front         = _mm256_mask_i32gather_epi32(avx_index_front, 
B_col_buffer, avx_row_start, avx_load_mask, 4);
-
-  int *output_ptr = row_C_vector_output;
-
-  while (1)
-  {
-    // get minimum index in current front:
-    __m256i avx_index_min1 = avx_index_front;
-    __m256i avx_temp       = _mm256_permutevar8x32_epi32(avx_index_min1, 
_mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4));
-    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first four 
elements compared against last four elements
-
-    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(78));    // 
0b01001110 = 78, using shuffle instead of permutevar here because of lower 
latency
-    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first two 
elements compared against elements three and four (same for upper half of 
register)
-
-    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(177));    // 
0b10110001 = 177, using shuffle instead of permutevar here because of lower 
latency
-    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // now all 
entries of avx_index_min1 hold the minimum
-
-    int min_index_in_front = ((int*)&avx_index_min1)[0];
-    // check for end of merge operation:
-    if (min_index_in_front == B_size2)
-      break;
-
-    // write current entry:
-    *output_ptr = min_index_in_front;
-    ++output_ptr;
-
-    // advance index front where equal to minimum index:
-    avx_load_mask   = _mm256_cmpeq_epi32(avx_index_front, avx_index_min1);
-    // first part: set index to B_size2 if equal to minimum index:
-    avx_temp        = _mm256_and_si256(avx_all_bsize2, avx_load_mask);
-    avx_index_front = _mm256_max_epi32(avx_index_front, avx_temp);
-    // second part: increment row_start registers where minimum found:
-    avx_temp        = _mm256_and_si256(avx_all_ones, avx_load_mask); //ones 
only where the minimum was found
-    avx_row_start   = _mm256_add_epi32(avx_row_start, avx_temp);
-    // third part part: load new data where more entries available:
-    avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
-    avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, 
B_col_buffer, avx_row_start, avx_load_mask, 4);
-  }
-
-  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
-}
-#endif
-
-/** @brief Merges up to IndexNum rows from B into the result buffer.
-*
-* Because the input buffer also needs to be considered, this routine actually 
works on an index front of length (IndexNum+1)
-**/
-template<unsigned int IndexNum>
-unsigned int row_C_scan_symbolic_vector_N(unsigned int const *row_indices_B,
-                                          unsigned int const *B_row_buffer, 
unsigned int const *B_col_buffer, unsigned int B_size2,
-                                          unsigned int const 
*row_C_vector_input, unsigned int const *row_C_vector_input_end,
-                                          unsigned int *row_C_vector_output)
-{
-  unsigned int index_front[IndexNum+1];
-  unsigned int const *index_front_start[IndexNum+1];
-  unsigned int const *index_front_end[IndexNum+1];
-
-  // Set up pointers for loading the indices:
-  for (unsigned int i=0; i<IndexNum; ++i, ++row_indices_B)
-  {
-    index_front_start[i] = B_col_buffer + B_row_buffer[*row_indices_B];
-    index_front_end[i]   = B_col_buffer + B_row_buffer[*row_indices_B + 1];
-  }
-  index_front_start[IndexNum] = row_C_vector_input;
-  index_front_end[IndexNum]   = row_C_vector_input_end;
-
-  // load indices:
-  for (unsigned int i=0; i<=IndexNum; ++i)
-    index_front[i] = (index_front_start[i] < index_front_end[i]) ? 
*index_front_start[i] : B_size2;
-
-  unsigned int *output_ptr = row_C_vector_output;
-
-  while (1)
-  {
-    // get minimum index in current front:
-    unsigned int min_index_in_front = B_size2;
-    for (unsigned int i=0; i<=IndexNum; ++i)
-      min_index_in_front = std::min(min_index_in_front, index_front[i]);
-
-    if (min_index_in_front == B_size2) // we're done
-      break;
-
-    // advance index front where equal to minimum index:
-    for (unsigned int i=0; i<=IndexNum; ++i)
-    {
-      if (index_front[i] == min_index_in_front)
-      {
-        index_front_start[i] += 1;
-        index_front[i] = (index_front_start[i] < index_front_end[i]) ? 
*index_front_start[i] : B_size2;
-      }
-    }
-
-    // write current entry:
-    *output_ptr = min_index_in_front;
-    ++output_ptr;
-  }
-
-  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
-}
-
-struct spgemm_output_write_enabled  { static void apply(unsigned int *ptr, 
unsigned int value) { *ptr = value; } };
-struct spgemm_output_write_disabled { static void apply(unsigned int *   , 
unsigned int      ) {               } };
-
-template<typename OutputWriterT>
-unsigned int row_C_scan_symbolic_vector_1(unsigned int const *input1_begin, 
unsigned int const *input1_end,
-                                          unsigned int const *input2_begin, 
unsigned int const *input2_end,
-                                          unsigned int termination_index,
-                                          unsigned int *output_begin)
-{
-  unsigned int *output_ptr = output_begin;
-
-  unsigned int val_1 = (input1_begin < input1_end) ? *input1_begin : 
termination_index;
-  unsigned int val_2 = (input2_begin < input2_end) ? *input2_begin : 
termination_index;
-  while (1)
-  {
-    unsigned int min_index = std::min(val_1, val_2);
-
-    if (min_index == termination_index)
-      break;
-
-    if (min_index == val_1)
-    {
-      ++input1_begin;
-      val_1 = (input1_begin < input1_end) ? *input1_begin : termination_index;
-    }
-
-    if (min_index == val_2)
-    {
-      ++input2_begin;
-      val_2 = (input2_begin < input2_end) ? *input2_begin : termination_index;
-    }
-
-    // write current entry:
-    OutputWriterT::apply(output_ptr, min_index); // *output_ptr = min_index;   
 if necessary
-    ++output_ptr;
-  }
-
-  return static_cast<unsigned int>(output_ptr - output_begin);
-}
-
-inline
-unsigned int row_C_scan_symbolic_vector(unsigned int row_start_A, unsigned int 
row_end_A, unsigned int const *A_col_buffer,
-                                        unsigned int const *B_row_buffer, 
unsigned int const *B_col_buffer, unsigned int B_size2,
-                                        unsigned int *row_C_vector_1, unsigned 
int *row_C_vector_2, unsigned int *row_C_vector_3)
-{
-  // Trivial case: row length 0:
-  if (row_start_A == row_end_A)
-    return 0;
-
-  // Trivial case: row length 1:
-  if (row_end_A - row_start_A == 1)
-  {
-    unsigned int A_col = A_col_buffer[row_start_A];
-    return B_row_buffer[A_col + 1] - B_row_buffer[A_col];
-  }
-
-  // Optimizations for row length 2:
-  unsigned int row_C_len = 0;
-  if (row_end_A - row_start_A == 2)
-  {
-    unsigned int A_col_1 = A_col_buffer[row_start_A];
-    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
-    return 
row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(B_col_buffer + 
B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
-                                                                      
B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
-                                                                      B_size2,
-                                                                      
row_C_vector_1);
-  }
-  else // for more than two rows we can safely merge the first two:
-  {
-#ifdef VIENNACL_WITH_AVX2
-    row_C_len = row_C_scan_symbolic_vector_AVX2((const int*)(A_col_buffer + 
row_start_A), (const int*)(A_col_buffer + row_end_A),
-                                                (const int*)B_row_buffer, 
(const int*)B_col_buffer, int(B_size2),
-                                                (int*)row_C_vector_1);
-    row_start_A += 8;
-#else
-    unsigned int A_col_1 = A_col_buffer[row_start_A];
-    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
-    row_C_len =  
row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + 
B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
-                                                                           
B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
-                                                                           
B_size2,
-                                                                           
row_C_vector_1);
-    row_start_A += 2;
-#endif
-  }
-
-  // all other row lengths:
-  while (row_end_A > row_start_A)
-  {
-#ifdef VIENNACL_WITH_AVX2
-    if (row_end_A - row_start_A > 2) // we deal with one or two remaining rows 
more efficiently below:
-    {
-      unsigned int merged_len = row_C_scan_symbolic_vector_AVX2((const 
int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A),
-                                                                (const 
int*)B_row_buffer, (const int*)B_col_buffer, int(B_size2),
-                                                                
(int*)row_C_vector_3);
-      if (row_start_A + 8 >= row_end_A)
-        row_C_len = 
row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(row_C_vector_3, 
row_C_vector_3 + merged_len,
-                                                                              
row_C_vector_1, row_C_vector_1 + row_C_len,
-                                                                              
B_size2,
-                                                                              
row_C_vector_2);
-      else
-        row_C_len = 
row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(row_C_vector_3, 
row_C_vector_3 + merged_len,
-                                                                               
row_C_vector_1, row_C_vector_1 + row_C_len,
-                                                                               
B_size2,
-                                                                               
row_C_vector_2);
-      row_start_A += 8;
-    }
-    else
-#endif
-    if (row_start_A == row_end_A - 1) // last merge operation. No need to 
write output
-    {
-      // process last row
-      unsigned int row_index_B = A_col_buffer[row_start_A];
-      return 
row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(B_col_buffer + 
B_row_buffer[row_index_B], B_col_buffer + B_row_buffer[row_index_B + 1],
-                                                                        
row_C_vector_1, row_C_vector_1 + row_C_len,
-                                                                        
B_size2,
-                                                                        
row_C_vector_2);
-    }
-    else if (row_start_A + 1 < row_end_A)// at least two more rows left, so 
merge them
-    {
-      // process single row:
-      unsigned int A_col_1 = A_col_buffer[row_start_A];
-      unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
-      unsigned int merged_len =  
row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + 
B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
-                                                                               
            B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + 
B_row_buffer[A_col_2 + 1],
-                                                                               
            B_size2,
-                                                                               
            row_C_vector_3);
-      if (row_start_A + 2 == row_end_A) // last merge does not need a write:
-        return 
row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(row_C_vector_3, 
row_C_vector_3 + merged_len,
-                                                                          
row_C_vector_1, row_C_vector_1 + row_C_len,
-                                                                          
B_size2,
-                                                                          
row_C_vector_2);
-      else
-        row_C_len = 
row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(row_C_vector_3, 
row_C_vector_3 + merged_len,
-                                                                              
row_C_vector_1, row_C_vector_1 + row_C_len,
-                                                                              
B_size2,
-                                                                              
row_C_vector_2);
-      row_start_A += 2;
-    }
-    else // at least two more rows left
-    {
-      // process single row:
-      unsigned int row_index_B = A_col_buffer[row_start_A];
-      row_C_len = 
row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + 
B_row_buffer[row_index_B], B_col_buffer + B_row_buffer[row_index_B + 1],
-                                                                            
row_C_vector_1, row_C_vector_1 + row_C_len,
-                                                                            
B_size2,
-                                                                            
row_C_vector_2);
-      ++row_start_A;
-    }
-
-    std::swap(row_C_vector_1, row_C_vector_2);
-  }
-
-  return row_C_len;
-}
-
-//////////////////////////////
-
-/** @brief Merges up to IndexNum rows from B into the result buffer.
-*
-* Because the input buffer also needs to be considered, this routine actually 
works on an index front of length (IndexNum+1)
-**/
-template<unsigned int IndexNum, typename NumericT>
-unsigned int row_C_scan_numeric_vector_N(unsigned int const *row_indices_B, 
NumericT const *val_A,
-                                          unsigned int const *B_row_buffer, 
unsigned int const *B_col_buffer, NumericT const *B_elements, unsigned int 
B_size2,
-                                          unsigned int const 
*row_C_vector_input, unsigned int const *row_C_vector_input_end, NumericT 
*row_C_vector_input_values,
-                                          unsigned int *row_C_vector_output, 
NumericT *row_C_vector_output_values)
-{
-  unsigned int index_front[IndexNum+1];
-  unsigned int const *index_front_start[IndexNum+1];
-  unsigned int const *index_front_end[IndexNum+1];
-  NumericT const * value_front_start[IndexNum+1];
-  NumericT values_A[IndexNum+1];
-
-  // Set up pointers for loading the indices:
-  for (unsigned int i=0; i<IndexNum; ++i, ++row_indices_B)
-  {
-    unsigned int row_B = *row_indices_B;
-
-    index_front_start[i] = B_col_buffer + B_row_buffer[row_B];
-    index_front_end[i]   = B_col_buffer + B_row_buffer[row_B + 1];
-    value_front_start[i] = B_elements   + B_row_buffer[row_B];
-    values_A[i]          = val_A[i];
-  }
-  index_front_start[IndexNum] = row_C_vector_input;
-  index_front_end[IndexNum]   = row_C_vector_input_end;
-  value_front_start[IndexNum] = row_C_vector_input_values;
-  values_A[IndexNum]          = NumericT(1);
-
-  // load indices:
-  for (unsigned int i=0; i<=IndexNum; ++i)
-    index_front[i] = (index_front_start[i] < index_front_end[i]) ? 
*index_front_start[i] : B_size2;
-
-  unsigned int *output_ptr = row_C_vector_output;
-
-  while (1)
-  {
-    // get minimum index in current front:
-    unsigned int min_index_in_front = B_size2;
-    for (unsigned int i=0; i<=IndexNum; ++i)
-      min_index_in_front = std::min(min_index_in_front, index_front[i]);
-
-    if (min_index_in_front == B_size2) // we're done
-      break;
-
-    // advance index front where equal to minimum index:
-    NumericT row_C_value = 0;
-    for (unsigned int i=0; i<=IndexNum; ++i)
-    {
-      if (index_front[i] == min_index_in_front)
-      {
-        index_front_start[i] += 1;
-        index_front[i] = (index_front_start[i] < index_front_end[i]) ? 
*index_front_start[i] : B_size2;
-
-        row_C_value += values_A[i] * *value_front_start[i];
-        value_front_start[i] += 1;
-      }
-    }
-
-    // write current entry:
-    *output_ptr = min_index_in_front;
-    ++output_ptr;
-    *row_C_vector_output_values = row_C_value;
-    ++row_C_vector_output_values;
-  }
-
-  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
-}
-
-
-
-#ifdef VIENNACL_WITH_AVX2
-inline
-unsigned int row_C_scan_numeric_vector_AVX2(int const *row_indices_B_begin, 
int const *row_indices_B_end, double const *values_A,
-                                             int const *B_row_buffer, int 
const *B_col_buffer, double const *B_elements,
-                                             int B_size2,
-                                             int *row_C_vector_output, double 
*row_C_vector_output_values)
-{
-  __m256i avx_all_ones    = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
-  __m256i avx_all_bsize2  = _mm256_set_epi32(B_size2, B_size2, B_size2, 
B_size2, B_size2, B_size2, B_size2, B_size2);
-
-  __m256i avx_row_indices_offsets = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
-  __m256i avx_load_mask = _mm256_sub_epi32(avx_row_indices_offsets, 
_mm256_set1_epi32(row_indices_B_end - row_indices_B_begin));
-  __m256i avx_load_mask2 = avx_load_mask;
-
-  __m256i avx_row_indices = _mm256_set1_epi32(0);
-          avx_row_indices = _mm256_mask_i32gather_epi32(avx_row_indices, 
row_indices_B_begin, avx_row_indices_offsets, avx_load_mask, 4);
-
-  // load values from A:
-  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
-  __m256d avx_value_A_low  = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 
0), //src
-                                                      values_A,                
  //base ptr
-                                                      
_mm256_extractf128_si256(avx_row_indices_offsets, 0),                           
//indices
-                                                      
_mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 
0, 4)), 8); // mask
-  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
-  __m256d avx_value_A_high  = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 
0), //src
-                                                       values_A,               
   //base ptr
-                                                       
_mm256_extractf128_si256(avx_row_indices_offsets, 1),                           
//indices
-                                                       
_mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 
4, 0)), 8); // mask
-
-
-            avx_load_mask = avx_load_mask2; // reload mask (destroyed by 
gather)
-  __m256i avx_row_start   = _mm256_mask_i32gather_epi32(avx_all_ones, 
B_row_buffer,   avx_row_indices, avx_load_mask, 4);
-            avx_load_mask = avx_load_mask2; // reload mask (destroyed by 
gather)
-  __m256i avx_row_end     = _mm256_mask_i32gather_epi32(avx_all_ones, 
B_row_buffer+1, avx_row_indices, avx_load_mask, 4);
-
-          avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
-          avx_load_mask2  = avx_load_mask;
-  __m256i avx_index_front = avx_all_bsize2;
-  avx_index_front         = _mm256_mask_i32gather_epi32(avx_index_front, 
B_col_buffer, avx_row_start, avx_load_mask, 4);
-
-  // load front values from B:
-  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
-  __m256d avx_value_front_low  = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 
0, 0), //src
-                                                          B_elements,          
        //base ptr
-                                                          
_mm256_extractf128_si256(avx_row_start, 0),                           //indices
-                                                          
_mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 
0, 4)), 8); // mask
-  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
-  __m256d avx_value_front_high  = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 
0, 0), //src
-                                                           B_elements,         
         //base ptr
-                                                           
_mm256_extractf128_si256(avx_row_start, 1),                           //indices
-                                                           
_mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 
4, 0)), 8); // mask
-
-  int *output_ptr = row_C_vector_output;
-
-  while (1)
-  {
-    // get minimum index in current front:
-    __m256i avx_index_min1 = avx_index_front;
-    __m256i avx_temp       = _mm256_permutevar8x32_epi32(avx_index_min1, 
_mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4));
-    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first four 
elements compared against last four elements
-
-    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(78));    // 
0b01001110 = 78, using shuffle instead of permutevar here because of lower 
latency
-    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first two 
elements compared against elements three and four (same for upper half of 
register)
-
-    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(177));    // 
0b10110001 = 177, using shuffle instead of permutevar here because of lower 
latency
-    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // now all 
entries of avx_index_min1 hold the minimum
-
-    int min_index_in_front = ((int*)&avx_index_min1)[0];
-    // check for end of merge operation:
-    if (min_index_in_front == B_size2)
-      break;
-
-    // accumulate value (can certainly be done more elegantly...)
-    double value = 0;
-    value += (min_index_in_front == ((int*)&avx_index_front)[0]) ? 
((double*)&avx_value_front_low)[0] * ((double*)&avx_value_A_low)[0] : 0;
-    value += (min_index_in_front == ((int*)&avx_index_front)[1]) ? 
((double*)&avx_value_front_low)[1] * ((double*)&avx_value_A_low)[1] : 0;
-    value += (min_index_in_front == ((int*)&avx_index_front)[2]) ? 
((double*)&avx_value_front_low)[2] * ((double*)&avx_value_A_low)[2] : 0;
-    value += (min_index_in_front == ((int*)&avx_index_front)[3]) ? 
((double*)&avx_value_front_low)[3] * ((double*)&avx_value_A_low)[3] : 0;
-    value += (min_index_in_front == ((int*)&avx_index_front)[4]) ? 
((double*)&avx_value_front_high)[0] * ((double*)&avx_value_A_high)[0] : 0;
-    value += (min_index_in_front == ((int*)&avx_index_front)[5]) ? 
((double*)&avx_value_front_high)[1] * ((double*)&avx_value_A_high)[1] : 0;
-    value += (min_index_in_front == ((int*)&avx_index_front)[6]) ? 
((double*)&avx_value_front_high)[2] * ((double*)&avx_value_A_high)[2] : 0;
-    value += (min_index_in_front == ((int*)&avx_index_front)[7]) ? 
((double*)&avx_value_front_high)[3] * ((double*)&avx_value_A_high)[3] : 0;
-    *row_C_vector_output_values = value;
-    ++row_C_vector_output_values;
-
-    // write current entry:
-    *output_ptr = min_index_in_front;
-    ++output_ptr;
-
-    // advance index front where equal to minimum index:
-    avx_load_mask   = _mm256_cmpeq_epi32(avx_index_front, avx_index_min1);
-    // first part: set index to B_size2 if equal to minimum index:
-    avx_temp        = _mm256_and_si256(avx_all_bsize2, avx_load_mask);
-    avx_index_front = _mm256_max_epi32(avx_index_front, avx_temp);
-    // second part: increment row_start registers where minimum found:
-    avx_temp        = _mm256_and_si256(avx_all_ones, avx_load_mask); //ones 
only where the minimum was found
-    avx_row_start   = _mm256_add_epi32(avx_row_start, avx_temp);
-    // third part part: load new data where more entries available:
-    avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
-    avx_load_mask2  = avx_load_mask;
-    avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, 
B_col_buffer, avx_row_start, avx_load_mask, 4);
-
-    // load new values where necessary:
-    avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
-    avx_value_front_low = _mm256_mask_i32gather_pd(avx_value_front_low, //src
-                                            B_elements,                  
//base ptr
-                                            
_mm256_extractf128_si256(avx_row_start, 0),                           //indices
-                                            
_mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 
0, 4)), 8); // mask
-
-    avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
-    avx_value_front_high = _mm256_mask_i32gather_pd(avx_value_front_high, //src
-                                    B_elements,                  //base ptr
-                                    _mm256_extractf128_si256(avx_row_start, 
1),                           //indices
-                                    _mm256_permutevar8x32_epi32(avx_load_mask, 
_mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)), 8); // mask
-
-    //multiply new entries:
-
-  }
-
-  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
-}
-#endif
-
-
-template<typename NumericT>
-unsigned int row_C_scan_numeric_vector_1(unsigned int const 
*input1_index_begin, unsigned int const *input1_index_end, NumericT const 
*input1_values_begin, NumericT factor1,
-                                         unsigned int const 
*input2_index_begin, unsigned int const *input2_index_end, NumericT const 
*input2_values_begin, NumericT factor2,
-                                         unsigned int termination_index,
-                                         unsigned int *output_index_begin, 
NumericT *output_values_begin)
-{
-  unsigned int *output_ptr = output_index_begin;
-
-  unsigned int index1 = (input1_index_begin < input1_index_end) ? 
*input1_index_begin : termination_index;
-  unsigned int index2 = (input2_index_begin < input2_index_end) ? 
*input2_index_begin : termination_index;
-
-  while (1)
-  {
-    unsigned int min_index = std::min(index1, index2);
-    NumericT value = 0;
-
-    if (min_index == termination_index)
-      break;
-
-    if (min_index == index1)
-    {
-      ++input1_index_begin;
-      index1 = (input1_index_begin < input1_index_end) ? *input1_index_begin : 
termination_index;
-
-      value += factor1 * *input1_values_begin;
-      ++input1_values_begin;
-    }
-
-    if (min_index == index2)
-    {
-      ++input2_index_begin;
-      index2 = (input2_index_begin < input2_index_end) ? *input2_index_begin : 
termination_index;
-
-      value += factor2 * *input2_values_begin;
-      ++input2_values_begin;
-    }
-
-    // write current entry:
-    *output_ptr = min_index;
-    ++output_ptr;
-    *output_values_begin = value;
-    ++output_values_begin;
-  }
-
-  return static_cast<unsigned int>(output_ptr - output_index_begin);
-}
-
/** @brief Numeric stage of sparse matrix-matrix multiplication (CSR) for one row of C = A * B.
*
* Accumulates the linear combination of the B-rows selected by the A-row entries
* [row_start_A, row_end_A) and writes the resulting column indices and values into
* C_col_buffer/C_elements starting at offset row_start_C. Rows are merged pairwise
* (eight at a time when AVX2 is available) with double buffering over the three
* scratch arrays row_C_vector_1/2/3. B_size2 (number of columns of B) serves as the
* termination sentinel for the merge helper, since no valid column index can reach it.
*/
template<typename NumericT>
void row_C_scan_numeric_vector(unsigned int row_start_A, unsigned int row_end_A, unsigned int const *A_col_buffer, NumericT const *A_elements,
                               unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, NumericT const *B_elements, unsigned int B_size2,
                               unsigned int row_start_C, unsigned int row_end_C, unsigned int *C_col_buffer, NumericT *C_elements,
                               unsigned int *row_C_vector_1, NumericT *row_C_vector_1_values,
                               unsigned int *row_C_vector_2, NumericT *row_C_vector_2_values,
                               unsigned int *row_C_vector_3, NumericT *row_C_vector_3_values)
{
  (void)row_end_C; // row extent in C is implied by the symbolic stage; parameter kept for interface symmetry

  // Trivial case: row length 0:
  if (row_start_A == row_end_A)
    return;

  // Trivial case: row length 1: the C-row is a scaled copy of a single B-row, write it directly:
  if (row_end_A - row_start_A == 1)
  {
    unsigned int A_col = A_col_buffer[row_start_A];
    unsigned int B_end = B_row_buffer[A_col + 1];
    NumericT A_value   = A_elements[row_start_A];
    C_col_buffer += row_start_C;
    C_elements += row_start_C;
    for (unsigned int j = B_row_buffer[A_col]; j < B_end; ++j, ++C_col_buffer, ++C_elements)
    {
      *C_col_buffer = B_col_buffer[j];
      *C_elements = A_value * B_elements[j];
    }
    return;
  }

  // Bootstrap phase: produce the first partial merge (into C directly for exactly
  // two rows, otherwise into scratch buffer 1).
  unsigned int row_C_len = 0;
  if (row_end_A - row_start_A == 2) // directly merge to C:
  {
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];

    unsigned int B_offset_1 = B_row_buffer[A_col_1];
    unsigned int B_offset_2 = B_row_buffer[A_col_2];

    row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
                                B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
                                B_size2,
                                C_col_buffer + row_start_C, C_elements + row_start_C);
    return;
  }
#ifdef VIENNACL_WITH_AVX2
  else if (row_end_A - row_start_A > 10) // safely merge eight rows into temporary buffer:
  {
    row_C_len = row_C_scan_numeric_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A), A_elements + row_start_A,
                                               (const int*)B_row_buffer, (const int*)B_col_buffer, B_elements, int(B_size2),
                                               (int*)row_C_vector_1, row_C_vector_1_values);
    row_start_A += 8;
  }
#endif
  else // safely merge two rows into temporary buffer:
  {
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];

    unsigned int B_offset_1 = B_row_buffer[A_col_1];
    unsigned int B_offset_2 = B_row_buffer[A_col_2];

    row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
                                            B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
                                            B_size2,
                                            row_C_vector_1, row_C_vector_1_values);
    row_start_A += 2;
  }

  // process remaining rows: fold further B-rows into the running accumulator
  // (row_C_vector_1), writing the last merge directly to C.
  while (row_end_A > row_start_A)
  {
#ifdef VIENNACL_WITH_AVX2
    if (row_end_A - row_start_A > 9) // code in other if-conditionals ensures that values get written to C
    {
      unsigned int merged_len = row_C_scan_numeric_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A), A_elements + row_start_A,
                                                               (const int*)B_row_buffer, (const int*)B_col_buffer, B_elements, int(B_size2),
                                                               (int*)row_C_vector_3, row_C_vector_3_values);
      row_C_len = row_C_scan_numeric_vector_1(row_C_vector_3, row_C_vector_3 + merged_len, row_C_vector_3_values, NumericT(1.0),
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              row_C_vector_2, row_C_vector_2_values);
      row_start_A += 8;
    }
    else
#endif
    if (row_start_A + 1 == row_end_A) // last row to merge, write directly to C:
    {
      unsigned int A_col    = A_col_buffer[row_start_A];
      unsigned int B_offset = B_row_buffer[A_col];

      row_C_scan_numeric_vector_1(B_col_buffer + B_offset, B_col_buffer + B_row_buffer[A_col+1], B_elements + B_offset, A_elements[row_start_A],
                                  row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                  B_size2,
                                  C_col_buffer + row_start_C, C_elements + row_start_C);
      return;
    }
    else if (row_start_A + 2 < row_end_A)// at least three more rows left, so merge two
    {
      // Merge the next two B-rows into scratch buffer 3, then fold that into the accumulator:
      unsigned int A_col_1 = A_col_buffer[row_start_A];
      unsigned int A_col_2 = A_col_buffer[row_start_A + 1];

      unsigned int B_offset_1 = B_row_buffer[A_col_1];
      unsigned int B_offset_2 = B_row_buffer[A_col_2];

      unsigned int merged_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
                                                            B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
                                                            B_size2,
                                                            row_C_vector_3, row_C_vector_3_values);
      row_C_len = row_C_scan_numeric_vector_1(row_C_vector_3, row_C_vector_3 + merged_len, row_C_vector_3_values, NumericT(1.0),
                                              row_C_vector_1, row_C_vector_1 + row_C_len,  row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              row_C_vector_2, row_C_vector_2_values);
      row_start_A += 2;
    }
    else
    {
      // Exactly two rows left: fold a single B-row now so the final iteration
      // hits the write-directly-to-C branch above.
      unsigned int A_col    = A_col_buffer[row_start_A];
      unsigned int B_offset = B_row_buffer[A_col];

      row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset, B_col_buffer + B_row_buffer[A_col+1], B_elements + B_offset, A_elements[row_start_A],
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              row_C_vector_2, row_C_vector_2_values);
      ++row_start_A;
    }

    // Double buffering: the freshly merged result becomes the accumulator for the next pass.
    std::swap(row_C_vector_1,        row_C_vector_2);
    std::swap(row_C_vector_1_values, row_C_vector_2_values);
  }
}
-
-
-} // namespace host_based
-} //namespace linalg
-} //namespace viennacl
-
-
-#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp
deleted file mode 100644
index b4944a2..0000000
--- 
a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp
+++ /dev/null
@@ -1,1188 +0,0 @@
-#ifndef VIENNACL_LINALG_HOST_BASED_VECTOR_OPERATIONS_HPP_
-#define VIENNACL_LINALG_HOST_BASED_VECTOR_OPERATIONS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file viennacl/linalg/host_based/vector_operations.hpp
-    @brief Implementations of vector operations using a plain single-threaded 
or OpenMP-enabled execution on CPU
-*/
-
-#include <cmath>
-#include <algorithm>  //for std::max and std::min
-
-#include "viennacl/forwards.h"
-#include "viennacl/scalar.hpp"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/meta/predicate.hpp"
-#include "viennacl/meta/enable_if.hpp"
-#include "viennacl/traits/size.hpp"
-#include "viennacl/traits/start.hpp"
-#include "viennacl/linalg/host_based/common.hpp"
-#include "viennacl/linalg/detail/op_applier.hpp"
-#include "viennacl/traits/stride.hpp"
-
-#ifdef VIENNACL_WITH_OPENMP
-#include <omp.h>
-#endif
-
-// Minimum vector size for using OpenMP on vector operations:
-#ifndef VIENNACL_OPENMP_VECTOR_MIN_SIZE
-  #define VIENNACL_OPENMP_VECTOR_MIN_SIZE  5000
-#endif
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace host_based
-{
namespace detail
{
  /** @brief Returns the negated argument; used to realize sign-flip flags outside hot loops. */
  template<typename NumericT>
  NumericT flip_sign(NumericT x) { return -x; }

  // Unsigned types cannot represent a flipped sign: return the value unchanged
  // (this also avoids compiler warnings about negating unsigned quantities).
  inline unsigned char  flip_sign(unsigned char  x) { return x; }
  inline unsigned short flip_sign(unsigned short x) { return x; }
  inline unsigned int   flip_sign(unsigned int   x) { return x; }
  inline unsigned long  flip_sign(unsigned long  x) { return x; }
}
-
-//
-// Introductory note: By convention, all dimensions are already checked in the 
dispatcher frontend. No need to double-check again in here!
-//
-template<typename DestNumericT, typename SrcNumericT>
-void convert(vector_base<DestNumericT> & dest, vector_base<SrcNumericT> const 
& src)
-{
-  DestNumericT      * data_dest = 
detail::extract_raw_pointer<DestNumericT>(dest);
-  SrcNumericT const * data_src  = 
detail::extract_raw_pointer<SrcNumericT>(src);
-
-  vcl_size_t start_dest = viennacl::traits::start(dest);
-  vcl_size_t inc_dest   = viennacl::traits::stride(dest);
-  vcl_size_t size_dest  = viennacl::traits::size(dest);
-
-  vcl_size_t start_src = viennacl::traits::start(src);
-  vcl_size_t inc_src   = viennacl::traits::stride(src);
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (size_dest > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-  for (long i = 0; i < static_cast<long>(size_dest); ++i)
-    data_dest[static_cast<vcl_size_t>(i)*inc_dest+start_dest] = 
static_cast<DestNumericT>(data_src[static_cast<vcl_size_t>(i)*inc_src+start_src]);
-}
-
/** @brief Implementation of vec1 = vec2 * alpha (or vec2 / alpha), with optional negation of alpha.
*
* @param vec1             The result vector (or -range, or -slice)
* @param vec2             The input vector, read with its own start/stride
* @param alpha            The scaling factor (host or device scalar)
* @param reciprocal_alpha If true, divide by alpha instead of multiplying
* @param flip_sign_alpha  If true, negate alpha first (no-op for unsigned NumericT, see detail::flip_sign)
*/
template<typename NumericT, typename ScalarT1>
void av(vector_base<NumericT> & vec1,
        vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha)
{
  typedef NumericT        value_type;

  value_type       * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
  value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);

  // Resolve the sign flag once, outside the hot loop:
  value_type data_alpha = alpha;
  if (flip_sign_alpha)
    data_alpha = detail::flip_sign(data_alpha);

  vcl_size_t start1 = viennacl::traits::start(vec1);
  vcl_size_t inc1   = viennacl::traits::stride(vec1);
  vcl_size_t size1  = viennacl::traits::size(vec1);

  vcl_size_t start2 = viennacl::traits::start(vec2);
  vcl_size_t inc2   = viennacl::traits::stride(vec2);

  // Two loop specializations keep the multiply/divide decision out of the loop body:
  if (reciprocal_alpha)
  {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
    for (long i = 0; i < static_cast<long>(size1); ++i)
      data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha;
  }
  else
  {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
    for (long i = 0; i < static_cast<long>(size1); ++i)
      data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha;
  }
}
-
-
/** @brief Implementation of vec1 = vec2 * (or /) alpha + vec3 * (or /) beta.
*
* @param vec1             The result vector (or -range, or -slice)
* @param vec2             First operand, scaled by alpha
* @param alpha            Scaling factor for vec2
* @param reciprocal_alpha If true, divide vec2 by alpha instead of multiplying
* @param flip_sign_alpha  If true, negate alpha first (no-op for unsigned NumericT)
* @param vec3             Second operand, scaled by beta
* @param beta             Scaling factor for vec3
* @param reciprocal_beta  If true, divide vec3 by beta instead of multiplying
* @param flip_sign_beta   If true, negate beta first
*/
template<typename NumericT, typename ScalarT1, typename ScalarT2>
void avbv(vector_base<NumericT> & vec1,
          vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t /* len_alpha */, bool reciprocal_alpha, bool flip_sign_alpha,
          vector_base<NumericT> const & vec3, ScalarT2 const & beta,  vcl_size_t /* len_beta */,  bool reciprocal_beta,  bool flip_sign_beta)
{
  typedef NumericT      value_type;

  value_type       * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
  value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
  value_type const * data_vec3 = detail::extract_raw_pointer<value_type>(vec3);

  // Resolve both sign flags up front so the hot loops stay branch-free:
  value_type data_alpha = alpha;
  if (flip_sign_alpha)
    data_alpha = detail::flip_sign(data_alpha);

  value_type data_beta = beta;
  if (flip_sign_beta)
    data_beta = detail::flip_sign(data_beta);

  vcl_size_t start1 = viennacl::traits::start(vec1);
  vcl_size_t inc1   = viennacl::traits::stride(vec1);
  vcl_size_t size1  = viennacl::traits::size(vec1);

  vcl_size_t start2 = viennacl::traits::start(vec2);
  vcl_size_t inc2   = viennacl::traits::stride(vec2);

  vcl_size_t start3 = viennacl::traits::start(vec3);
  vcl_size_t inc3   = viennacl::traits::stride(vec3);

  // Four loop specializations (multiply/divide per scalar) keep the decision out of the loop body:
  if (reciprocal_alpha)
  {
    if (reciprocal_beta)
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
      for (long i = 0; i < static_cast<long>(size1); ++i)
        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta;
    }
    else
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
      for (long i = 0; i < static_cast<long>(size1); ++i)
        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta;
    }
  }
  else
  {
    if (reciprocal_beta)
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
      for (long i = 0; i < static_cast<long>(size1); ++i)
        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta;
    }
    else
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
      for (long i = 0; i < static_cast<long>(size1); ++i)
        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta;
    }
  }
}
-
-
/** @brief Implementation of vec1 += vec2 * (or /) alpha + vec3 * (or /) beta.
*
* Same as avbv(), but accumulates into vec1 instead of overwriting it.
*
* @param vec1             The accumulated result vector (or -range, or -slice)
* @param vec2             First operand, scaled by alpha
* @param alpha            Scaling factor for vec2
* @param reciprocal_alpha If true, divide vec2 by alpha instead of multiplying
* @param flip_sign_alpha  If true, negate alpha first (no-op for unsigned NumericT)
* @param vec3             Second operand, scaled by beta
* @param beta             Scaling factor for vec3
* @param reciprocal_beta  If true, divide vec3 by beta instead of multiplying
* @param flip_sign_beta   If true, negate beta first
*/
template<typename NumericT, typename ScalarT1, typename ScalarT2>
void avbv_v(vector_base<NumericT> & vec1,
            vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
            vector_base<NumericT> const & vec3, ScalarT2 const & beta,  vcl_size_t /*len_beta*/,  bool reciprocal_beta,  bool flip_sign_beta)
{
  typedef NumericT        value_type;

  value_type       * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
  value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
  value_type const * data_vec3 = detail::extract_raw_pointer<value_type>(vec3);

  // Resolve both sign flags up front so the hot loops stay branch-free:
  value_type data_alpha = alpha;
  if (flip_sign_alpha)
    data_alpha = detail::flip_sign(data_alpha);

  value_type data_beta = beta;
  if (flip_sign_beta)
    data_beta = detail::flip_sign(data_beta);

  vcl_size_t start1 = viennacl::traits::start(vec1);
  vcl_size_t inc1   = viennacl::traits::stride(vec1);
  vcl_size_t size1  = viennacl::traits::size(vec1);

  vcl_size_t start2 = viennacl::traits::start(vec2);
  vcl_size_t inc2   = viennacl::traits::stride(vec2);

  vcl_size_t start3 = viennacl::traits::start(vec3);
  vcl_size_t inc3   = viennacl::traits::stride(vec3);

  // Four loop specializations (multiply/divide per scalar) keep the decision out of the loop body:
  if (reciprocal_alpha)
  {
    if (reciprocal_beta)
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
      for (long i = 0; i < static_cast<long>(size1); ++i)
        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta;
    }
    else
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
      for (long i = 0; i < static_cast<long>(size1); ++i)
        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta;
    }
  }
  else
  {
    if (reciprocal_beta)
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
      for (long i = 0; i < static_cast<long>(size1); ++i)
        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta;
    }
    else
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
      for (long i = 0; i < static_cast<long>(size1); ++i)
        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta;
    }
  }
}
-
-
-
-
-/** @brief Assign a constant value to a vector (-range/-slice)
-*
-* @param vec1   The vector to which the value should be assigned
-* @param alpha  The value to be assigned
-* @param up_to_internal_size  Specifies whether alpha should also be written 
to padded memory (mostly used for clearing the whole buffer).
-*/
-template<typename NumericT>
-void vector_assign(vector_base<NumericT> & vec1, const NumericT & alpha, bool 
up_to_internal_size = false)
-{
-  typedef NumericT       value_type;
-
-  value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
-
-  vcl_size_t start1 = viennacl::traits::start(vec1);
-  vcl_size_t inc1   = viennacl::traits::stride(vec1);
-  vcl_size_t size1  = viennacl::traits::size(vec1);
-  vcl_size_t loop_bound  = up_to_internal_size ? vec1.internal_size() : size1; 
 //Note: Do NOT use traits::internal_size() here, because vector proxies don't 
require padding.
-
-  value_type data_alpha = static_cast<value_type>(alpha);
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (loop_bound > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-  for (long i = 0; i < static_cast<long>(loop_bound); ++i)
-    data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_alpha;
-}
-
-
-/** @brief Swaps the contents of two vectors, data is copied
-*
-* @param vec1   The first vector (or -range, or -slice)
-* @param vec2   The second vector (or -range, or -slice)
-*/
-template<typename NumericT>
-void vector_swap(vector_base<NumericT> & vec1, vector_base<NumericT> & vec2)
-{
-  typedef NumericT      value_type;
-
-  value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
-  value_type * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
-
-  vcl_size_t start1 = viennacl::traits::start(vec1);
-  vcl_size_t inc1   = viennacl::traits::stride(vec1);
-  vcl_size_t size1  = viennacl::traits::size(vec1);
-
-  vcl_size_t start2 = viennacl::traits::start(vec2);
-  vcl_size_t inc2   = viennacl::traits::stride(vec2);
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-  for (long i = 0; i < static_cast<long>(size1); ++i)
-  {
-    value_type temp = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2];
-    data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] = 
data_vec1[static_cast<vcl_size_t>(i)*inc1+start1];
-    data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = temp;
-  }
-}
-
-
-///////////////////////// Elementwise operations /////////////
-
-/** @brief Implementation of the element-wise operation v1 = v2 .* v3 and v1 = 
v2 ./ v3    (using MATLAB syntax)
-*
-* @param vec1   The result vector (or -range, or -slice)
-* @param proxy  The proxy object holding v2, v3 and the operation
-*/
-template<typename NumericT, typename OpT>
-void element_op(vector_base<NumericT> & vec1,
-                vector_expression<const vector_base<NumericT>, const 
vector_base<NumericT>, op_element_binary<OpT> > const & proxy)
-{
-  typedef NumericT                                           value_type;
-  typedef viennacl::linalg::detail::op_applier<op_element_binary<OpT> >    
OpFunctor;
-
-  value_type       * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
-  value_type const * data_vec2 = 
detail::extract_raw_pointer<value_type>(proxy.lhs());
-  value_type const * data_vec3 = 
detail::extract_raw_pointer<value_type>(proxy.rhs());
-
-  vcl_size_t start1 = viennacl::traits::start(vec1);
-  vcl_size_t inc1   = viennacl::traits::stride(vec1);
-  vcl_size_t size1  = viennacl::traits::size(vec1);
-
-  vcl_size_t start2 = viennacl::traits::start(proxy.lhs());
-  vcl_size_t inc2   = viennacl::traits::stride(proxy.lhs());
-
-  vcl_size_t start3 = viennacl::traits::start(proxy.rhs());
-  vcl_size_t inc3   = viennacl::traits::stride(proxy.rhs());
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-  for (long i = 0; i < static_cast<long>(size1); ++i)
-    OpFunctor::apply(data_vec1[static_cast<vcl_size_t>(i)*inc1+start1], 
data_vec2[static_cast<vcl_size_t>(i)*inc2+start2], 
data_vec3[static_cast<vcl_size_t>(i)*inc3+start3]);
-}
-
-/** @brief Implementation of the element-wise operation v1 = v2 .* v3 and v1 = 
v2 ./ v3    (using MATLAB syntax)
-*
-* @param vec1   The result vector (or -range, or -slice)
-* @param proxy  The proxy object holding v2, v3 and the operation
-*/
-template<typename NumericT, typename OpT>
-void element_op(vector_base<NumericT> & vec1,
-                vector_expression<const vector_base<NumericT>, const 
vector_base<NumericT>, op_element_unary<OpT> > const & proxy)
-{
-  typedef NumericT      value_type;
-  typedef viennacl::linalg::detail::op_applier<op_element_unary<OpT> >    
OpFunctor;
-
-  value_type       * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
-  value_type const * data_vec2 = 
detail::extract_raw_pointer<value_type>(proxy.lhs());
-
-  vcl_size_t start1 = viennacl::traits::start(vec1);
-  vcl_size_t inc1   = viennacl::traits::stride(vec1);
-  vcl_size_t size1  = viennacl::traits::size(vec1);
-
-  vcl_size_t start2 = viennacl::traits::start(proxy.lhs());
-  vcl_size_t inc2   = viennacl::traits::stride(proxy.lhs());
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-  for (long i = 0; i < static_cast<long>(size1); ++i)
-    OpFunctor::apply(data_vec1[static_cast<vcl_size_t>(i)*inc1+start1], 
data_vec2[static_cast<vcl_size_t>(i)*inc2+start2]);
-}
-
-
-///////////////////////// Norms and inner product ///////////////////
-
-
-//implementation of inner product:
-
namespace detail
{

// the following circumvents problems when trying to use a variable of template parameter type for a reduction.
// Such a behavior is not covered by the OpenMP standard, hence we manually apply some preprocessor magic to resolve the problem.
// See https://github.com/viennacl/viennacl-dev/issues/112 for a detailed explanation and discussion.

// Part 1 of the expansion: function head and an accumulator of type TEMPSCALART
// (a wider type for the small integer types so intermediate sums are less prone to overflow).
#define VIENNACL_INNER_PROD_IMPL_1(RESULTSCALART, TEMPSCALART) \
  inline RESULTSCALART inner_prod_impl(RESULTSCALART const * data_vec1, vcl_size_t start1, vcl_size_t inc1, vcl_size_t size1, \
                                       RESULTSCALART const * data_vec2, vcl_size_t start2, vcl_size_t inc2) { \
    TEMPSCALART temp = 0;

// Part 2: the reduction loop and return statement. The OpenMP pragma is placed
// between the two macro expansions so that it ends up directly before the loop.
#define VIENNACL_INNER_PROD_IMPL_2(RESULTSCALART) \
    for (long i = 0; i < static_cast<long>(size1); ++i) \
      temp += data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] * data_vec2[static_cast<vcl_size_t>(i)*inc2+start2]; \
    return static_cast<RESULTSCALART>(temp); \
  }

// char
VIENNACL_INNER_PROD_IMPL_1(char, int)
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
VIENNACL_INNER_PROD_IMPL_2(char)

VIENNACL_INNER_PROD_IMPL_1(unsigned char, int)
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
VIENNACL_INNER_PROD_IMPL_2(unsigned char)


// short
VIENNACL_INNER_PROD_IMPL_1(short, int)
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
VIENNACL_INNER_PROD_IMPL_2(short)

VIENNACL_INNER_PROD_IMPL_1(unsigned short, int)
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
VIENNACL_INNER_PROD_IMPL_2(unsigned short)


// int
VIENNACL_INNER_PROD_IMPL_1(int, int)
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
VIENNACL_INNER_PROD_IMPL_2(int)

VIENNACL_INNER_PROD_IMPL_1(unsigned int, unsigned int)
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
VIENNACL_INNER_PROD_IMPL_2(unsigned int)


// long
VIENNACL_INNER_PROD_IMPL_1(long, long)
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
VIENNACL_INNER_PROD_IMPL_2(long)

VIENNACL_INNER_PROD_IMPL_1(unsigned long, unsigned long)
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
VIENNACL_INNER_PROD_IMPL_2(unsigned long)


// float
VIENNACL_INNER_PROD_IMPL_1(float, float)
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
VIENNACL_INNER_PROD_IMPL_2(float)

// double
VIENNACL_INNER_PROD_IMPL_1(double, double)
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
VIENNACL_INNER_PROD_IMPL_2(double)

#undef VIENNACL_INNER_PROD_IMPL_1
#undef VIENNACL_INNER_PROD_IMPL_2
}
-
-/** @brief Computes the inner product of two vectors - implementation. Library 
users should call inner_prod(vec1, vec2).
-*
-* @param vec1 The first vector
-* @param vec2 The second vector
-* @param result The result scalar (on the gpu)
-*/
-template<typename NumericT, typename ScalarT>
-void inner_prod_impl(vector_base<NumericT> const & vec1,
-                     vector_base<NumericT> const & vec2,
-                     ScalarT & result)
-{
-  typedef NumericT      value_type;
-
-  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
-  value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
-
-  vcl_size_t start1 = viennacl::traits::start(vec1);
-  vcl_size_t inc1   = viennacl::traits::stride(vec1);
-  vcl_size_t size1  = viennacl::traits::size(vec1);
-
-  vcl_size_t start2 = viennacl::traits::start(vec2);
-  vcl_size_t inc2   = viennacl::traits::stride(vec2);
-
-  result = detail::inner_prod_impl(data_vec1, start1, inc1, size1,
-                                   data_vec2, start2, inc2);  //Note: 
Assignment to result might be expensive, thus a temporary is introduced here
-}
-
/** @brief Computes the inner products <x, y_j> for every vector y_j in the tuple, writing them to 'result'.
*
* @param x          The common left-hand-side vector
* @param vec_tuple  Tuple of right-hand-side vectors y_j
* @param result     Output vector; result[j] receives <x, y_j>
*/
template<typename NumericT>
void inner_prod_impl(vector_base<NumericT> const & x,
                     vector_tuple<NumericT> const & vec_tuple,
                     vector_base<NumericT> & result)
{
  typedef NumericT        value_type;

  value_type const * data_x = detail::extract_raw_pointer<value_type>(x);

  vcl_size_t start_x = viennacl::traits::start(x);
  vcl_size_t inc_x   = viennacl::traits::stride(x);
  vcl_size_t size_x  = viennacl::traits::size(x);

  // Per-tuple-entry accumulators plus cached raw pointer / layout data for each y_j:
  std::vector<value_type> temp(vec_tuple.const_size());
  std::vector<value_type const *> data_y(vec_tuple.const_size());
  std::vector<vcl_size_t> start_y(vec_tuple.const_size());
  std::vector<vcl_size_t> stride_y(vec_tuple.const_size());

  for (vcl_size_t j=0; j<vec_tuple.const_size(); ++j)
  {
    data_y[j] = detail::extract_raw_pointer<value_type>(vec_tuple.const_at(j));
    start_y[j] = viennacl::traits::start(vec_tuple.const_at(j));
    stride_y[j] = viennacl::traits::stride(vec_tuple.const_at(j));
  }

  // Note: No OpenMP here because it cannot perform a reduction on temp-array. Savings in memory bandwidth are expected to still justify this approach...
  // (each entry of x is loaded once and reused for all y_j)
  for (vcl_size_t i = 0; i < size_x; ++i)
  {
    value_type entry_x = data_x[i*inc_x+start_x];
    for (vcl_size_t j=0; j < vec_tuple.const_size(); ++j)
      temp[j] += entry_x * data_y[j][i*stride_y[j]+start_y[j]];
  }

  for (vcl_size_t j=0; j < vec_tuple.const_size(); ++j)
    result[j] = temp[j];  //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
}
-
-
namespace detail
{

// Same preprocessor scheme as for inner_prod_impl above: the accumulator 'temp'
// uses TEMPSCALART (a wider type for small integers), and the OpenMP reduction
// pragma is placed between the two macro halves so it lands directly before the loop.

// Part 1: function head and accumulator declaration.
#define VIENNACL_NORM_1_IMPL_1(RESULTSCALART, TEMPSCALART) \
  inline RESULTSCALART norm_1_impl(RESULTSCALART const * data_vec1, vcl_size_t start1, vcl_size_t inc1, vcl_size_t size1) { \
    TEMPSCALART temp = 0;

// Part 2: sums absolute values (via a double promotion through std::fabs) and returns.
#define VIENNACL_NORM_1_IMPL_2(RESULTSCALART, TEMPSCALART) \
    for (long i = 0; i < static_cast<long>(size1); ++i) \
      temp += static_cast<TEMPSCALART>(std::fabs(static_cast<double>(data_vec1[static_cast<vcl_size_t>(i)*inc1+start1]))); \
    return static_cast<RESULTSCALART>(temp); \
  }

// char
VIENNACL_NORM_1_IMPL_1(char, int)
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
VIENNACL_NORM_1_IMPL_2(char, int)

VIENNACL_NORM_1_IMPL_1(unsigned char, int)
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
VIENNACL_NORM_1_IMPL_2(unsigned char, int)

// short
VIENNACL_NORM_1_IMPL_1(short, int)
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
VIENNACL_NORM_1_IMPL_2(short, int)

VIENNACL_NORM_1_IMPL_1(unsigned short, int)
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
VIENNACL_NORM_1_IMPL_2(unsigned short, int)


// int
VIENNACL_NORM_1_IMPL_1(int, int)
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
VIENNACL_NORM_1_IMPL_2(int, int)

VIENNACL_NORM_1_IMPL_1(unsigned int, unsigned int)
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
VIENNACL_NORM_1_IMPL_2(unsigned int, unsigned int)


// long
VIENNACL_NORM_1_IMPL_1(long, long)
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
VIENNACL_NORM_1_IMPL_2(long, long)

VIENNACL_NORM_1_IMPL_1(unsigned long, unsigned long)
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
VIENNACL_NORM_1_IMPL_2(unsigned long, unsigned long)


// float
VIENNACL_NORM_1_IMPL_1(float, float)
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
VIENNACL_NORM_1_IMPL_2(float, float)

// double
VIENNACL_NORM_1_IMPL_1(double, double)
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
#endif
VIENNACL_NORM_1_IMPL_2(double, double)

#undef VIENNACL_NORM_1_IMPL_1
#undef VIENNACL_NORM_1_IMPL_2

}
-
-/** @brief Computes the l^1-norm of a vector
-*
-* @param vec1 The vector
-* @param result The result scalar
-*/
-template<typename NumericT, typename ScalarT>
-void norm_1_impl(vector_base<NumericT> const & vec1,
-                 ScalarT & result)
-{
-  typedef NumericT        value_type;
-
-  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
-
-  vcl_size_t start1 = viennacl::traits::start(vec1);
-  vcl_size_t inc1   = viennacl::traits::stride(vec1);
-  vcl_size_t size1  = viennacl::traits::size(vec1);
-
-  result = detail::norm_1_impl(data_vec1, start1, inc1, size1);  //Note: 
Assignment to result might be expensive, thus using a temporary for accumulation
-}
-
-
-
-namespace detail
-{
-
-#define VIENNACL_NORM_2_IMPL_1(RESULTSCALART, TEMPSCALART) \
-  inline RESULTSCALART norm_2_impl(RESULTSCALART const * data_vec1, vcl_size_t 
start1, vcl_size_t inc1, vcl_size_t size1) { \
-    TEMPSCALART temp = 0;
-
-#define VIENNACL_NORM_2_IMPL_2(RESULTSCALART, TEMPSCALART) \
-    for (long i = 0; i < static_cast<long>(size1); ++i) { \
-      RESULTSCALART data = data_vec1[static_cast<vcl_size_t>(i)*inc1+start1]; \
-      temp += static_cast<TEMPSCALART>(data * data); \
-    } \
-    return static_cast<RESULTSCALART>(temp); \
-  }
-
-// char
-VIENNACL_NORM_2_IMPL_1(char, int)
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for reduction(+: temp) if (size1 > 
VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-VIENNACL_NORM_2_IMPL_2(char, int)
-
-VIENNACL_NORM_2_IMPL_1(unsigned char, int)
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for reduction(+: temp) if (size1 > 
VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-VIENNACL_NORM_2_IMPL_2(unsigned char, int)
-
-
-// short
-VIENNACL_NORM_2_IMPL_1(short, int)
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for reduction(+: temp) if (size1 > 
VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-VIENNACL_NORM_2_IMPL_2(short, int)
-
-VIENNACL_NORM_2_IMPL_1(unsigned short, int)
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for reduction(+: temp) if (size1 > 
VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-VIENNACL_NORM_2_IMPL_2(unsigned short, int)
-
-
-// int
-VIENNACL_NORM_2_IMPL_1(int, int)
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for reduction(+: temp) if (size1 > 
VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-VIENNACL_NORM_2_IMPL_2(int, int)
-
-VIENNACL_NORM_2_IMPL_1(unsigned int, unsigned int)
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for reduction(+: temp) if (size1 > 
VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-VIENNACL_NORM_2_IMPL_2(unsigned int, unsigned int)
-
-
-// long
-VIENNACL_NORM_2_IMPL_1(long, long)
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for reduction(+: temp) if (size1 > 
VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-VIENNACL_NORM_2_IMPL_2(long, long)
-
-VIENNACL_NORM_2_IMPL_1(unsigned long, unsigned long)
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for reduction(+: temp) if (size1 > 
VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-VIENNACL_NORM_2_IMPL_2(unsigned long, unsigned long)
-
-
-// float
-VIENNACL_NORM_2_IMPL_1(float, float)
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for reduction(+: temp) if (size1 > 
VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-VIENNACL_NORM_2_IMPL_2(float, float)
-
-// double
-VIENNACL_NORM_2_IMPL_1(double, double)
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for reduction(+: temp) if (size1 > 
VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-VIENNACL_NORM_2_IMPL_2(double, double)
-
-#undef VIENNACL_NORM_2_IMPL_1
-#undef VIENNACL_NORM_2_IMPL_2
-
-}
-
-
-/** @brief Computes the l^2-norm of a vector - implementation
-*
-* @param vec1 The vector
-* @param result The result scalar
-*/
-template<typename NumericT, typename ScalarT>
-void norm_2_impl(vector_base<NumericT> const & vec1,
-                 ScalarT & result)
-{
-  typedef NumericT       value_type;
-
-  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
-
-  vcl_size_t start1 = viennacl::traits::start(vec1);
-  vcl_size_t inc1   = viennacl::traits::stride(vec1);
-  vcl_size_t size1  = viennacl::traits::size(vec1);
-
-  result = std::sqrt(detail::norm_2_impl(data_vec1, start1, inc1, size1));  
//Note: Assignment to result might be expensive, thus 'temp' is used for 
accumulation
-}
-
-/** @brief Computes the supremum-norm of a vector
-*
-* @param vec1 The vector
-* @param result The result scalar
-*/
-template<typename NumericT, typename ScalarT>
-void norm_inf_impl(vector_base<NumericT> const & vec1,
-                   ScalarT & result)
-{
-  typedef NumericT       value_type;
-
-  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
-
-  vcl_size_t start1 = viennacl::traits::start(vec1);
-  vcl_size_t inc1   = viennacl::traits::stride(vec1);
-  vcl_size_t size1  = viennacl::traits::size(vec1);
-
-  vcl_size_t thread_count=1;
-
-  #ifdef VIENNACL_WITH_OPENMP
-  if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-      thread_count = omp_get_max_threads();
-  #endif
-
-  std::vector<value_type> temp(thread_count);
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-  {
-    vcl_size_t id = 0;
-#ifdef VIENNACL_WITH_OPENMP
-    id = omp_get_thread_num();
-#endif
-
-    vcl_size_t begin = (size1 * id) / thread_count;
-    vcl_size_t end   = (size1 * (id + 1)) / thread_count;
-    temp[id]         = 0;
-
-    for (vcl_size_t i = begin; i < end; ++i)
-      temp[id] = std::max<value_type>(temp[id], 
static_cast<value_type>(std::fabs(static_cast<double>(data_vec1[i*inc1+start1]))));
  //casting to double in order to avoid problems if T is an integer type
-  }
-  for (vcl_size_t i = 1; i < thread_count; ++i)
-    temp[0] = std::max<value_type>( temp[0], temp[i]);
-  result  = temp[0];
-}
-
-//This function should return a CPU scalar, otherwise statements like
-// vcl_rhs[index_norm_inf(vcl_rhs)]
-// are ambiguous
-/** @brief Computes the index of the first entry that is equal to the 
supremum-norm in modulus.
-*
-* @param vec1 The vector
-* @return The result. Note that the result must be a CPU scalar (unsigned 
int), since gpu scalars are floating point types.
-*/
-template<typename NumericT>
-vcl_size_t index_norm_inf(vector_base<NumericT> const & vec1)
-{
-  typedef NumericT      value_type;
-
-  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
-
-  vcl_size_t start1 = viennacl::traits::start(vec1);
-  vcl_size_t inc1   = viennacl::traits::stride(vec1);
-  vcl_size_t size1  = viennacl::traits::size(vec1);
-  vcl_size_t thread_count=1;
-
-#ifdef VIENNACL_WITH_OPENMP
-  if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-      thread_count = omp_get_max_threads();
-#endif
-
-  std::vector<value_type> temp(thread_count);
-  std::vector<vcl_size_t> index(thread_count);
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-  {
-    vcl_size_t id = 0;
-#ifdef VIENNACL_WITH_OPENMP
-    id = omp_get_thread_num();
-#endif
-    vcl_size_t begin = (size1 * id) / thread_count;
-    vcl_size_t end   = (size1 * (id + 1)) / thread_count;
-    index[id]        = start1;
-    temp[id]         = 0;
-    value_type data;
-
-    for (vcl_size_t i = begin; i < end; ++i)
-    {
-      data = 
static_cast<value_type>(std::fabs(static_cast<double>(data_vec1[i*inc1+start1])));
  //casting to double in order to avoid problems if T is an integer type
-      if (data > temp[id])
-      {
-        index[id] = i;
-        temp[id]  = data;
-      }
-    }
-  }
-  for (vcl_size_t i = 1; i < thread_count; ++i)
-  {
-    if (temp[i] > temp[0])
-    {
-      index[0] = index[i];
-      temp[0] = temp[i];
-    }
-  }
-  return index[0];
-}
-
-/** @brief Computes the maximum of a vector
-*
-* @param vec1 The vector
-* @param result The result scalar
-*/
-template<typename NumericT, typename ScalarT>
-void max_impl(vector_base<NumericT> const & vec1,
-              ScalarT & result)
-{
-  typedef NumericT       value_type;
-
-  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
-
-  vcl_size_t start1 = viennacl::traits::start(vec1);
-  vcl_size_t inc1   = viennacl::traits::stride(vec1);
-  vcl_size_t size1  = viennacl::traits::size(vec1);
-
-  vcl_size_t thread_count=1;
-
-#ifdef VIENNACL_WITH_OPENMP
-  if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-      thread_count = omp_get_max_threads();
-#endif
-
-  std::vector<value_type> temp(thread_count);
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-  {
-    vcl_size_t id = 0;
-#ifdef VIENNACL_WITH_OPENMP
-    id = omp_get_thread_num();
-#endif
-    vcl_size_t begin = (size1 * id) / thread_count;
-    vcl_size_t end   = (size1 * (id + 1)) / thread_count;
-    temp[id]         = data_vec1[start1];
-
-    for (vcl_size_t i = begin; i < end; ++i)
-    {
-      value_type v = data_vec1[i*inc1+start1];//Note: Assignment to 'vec1' in 
std::max might be expensive, thus 'v' is used for the function
-      temp[id] = std::max<value_type>(temp[id],v);
-    }
-  }
-  for (vcl_size_t i = 1; i < thread_count; ++i)
-    temp[0] = std::max<value_type>( temp[0], temp[i]);
-  result  = temp[0];//Note: Assignment to result might be expensive, thus 
'temp' is used for accumulation
-}
-
-/** @brief Computes the minimum of a vector
-*
-* @param vec1 The vector
-* @param result The result scalar
-*/
-template<typename NumericT, typename ScalarT>
-void min_impl(vector_base<NumericT> const & vec1,
-              ScalarT & result)
-{
-  typedef NumericT       value_type;
-
-  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
-
-  vcl_size_t start1 = viennacl::traits::start(vec1);
-  vcl_size_t inc1   = viennacl::traits::stride(vec1);
-  vcl_size_t size1  = viennacl::traits::size(vec1);
-
-  vcl_size_t thread_count=1;
-
-#ifdef VIENNACL_WITH_OPENMP
-  if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-      thread_count = omp_get_max_threads();
-#endif
-
-  std::vector<value_type> temp(thread_count);
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-  {
-    vcl_size_t id = 0;
-#ifdef VIENNACL_WITH_OPENMP
-    id = omp_get_thread_num();
-#endif
-    vcl_size_t begin = (size1 * id) / thread_count;
-    vcl_size_t end   = (size1 * (id + 1)) / thread_count;
-    temp[id]         = data_vec1[start1];
-
-    for (vcl_size_t i = begin; i < end; ++i)
-    {
-      value_type v = data_vec1[i*inc1+start1];//Note: Assignment to 'vec1' in 
std::min might be expensive, thus 'v' is used for the function
-      temp[id] = std::min<value_type>(temp[id],v);
-    }
-  }
-  for (vcl_size_t i = 1; i < thread_count; ++i)
-    temp[0] = std::min<value_type>( temp[0], temp[i]);
-  result  = temp[0];//Note: Assignment to result might be expensive, thus 
'temp' is used for accumulation
-}
-
-/** @brief Computes the sum of all elements from the vector
-*
-* @param vec1 The vector
-* @param result The result scalar
-*/
-template<typename NumericT, typename ScalarT>
-void sum_impl(vector_base<NumericT> const & vec1,
-              ScalarT & result)
-{
-  typedef NumericT       value_type;
-
-  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
-
-  vcl_size_t start1 = viennacl::traits::start(vec1);
-  vcl_size_t inc1   = viennacl::traits::stride(vec1);
-  vcl_size_t size1  = viennacl::traits::size(vec1);
-
-  value_type temp = 0;
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for reduction(+:temp) if (size1 > 
VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-  for (long i = 0; i < static_cast<long>(size1); ++i)
-    temp += data_vec1[static_cast<vcl_size_t>(i)*inc1+start1];
-
-  result = temp;  //Note: Assignment to result might be expensive, thus 'temp' 
is used for accumulation
-}
-
-/** @brief Computes a plane rotation of two vectors.
-*
-* Computes (x,y) <- (alpha * x + beta * y, -beta * x + alpha * y)
-*
-* @param vec1   The first vector
-* @param vec2   The second vector
-* @param alpha  The first transformation coefficient
-* @param beta   The second transformation coefficient
-*/
-template<typename NumericT>
-void plane_rotation(vector_base<NumericT> & vec1,
-                    vector_base<NumericT> & vec2,
-                    NumericT alpha, NumericT beta)
-{
-  typedef NumericT  value_type;
-
-  value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
-  value_type * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
-
-  vcl_size_t start1 = viennacl::traits::start(vec1);
-  vcl_size_t inc1   = viennacl::traits::stride(vec1);
-  vcl_size_t size1  = viennacl::traits::size(vec1);
-
-  vcl_size_t start2 = viennacl::traits::start(vec2);
-  vcl_size_t inc2   = viennacl::traits::stride(vec2);
-
-  value_type data_alpha = alpha;
-  value_type data_beta  = beta;
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-#endif
-  for (long i = 0; i < static_cast<long>(size1); ++i)
-  {
-    value_type temp1 = data_vec1[static_cast<vcl_size_t>(i)*inc1+start1];
-    value_type temp2 = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2];
-
-    data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_alpha * temp1 + 
data_beta * temp2;
-    data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] = data_alpha * temp2 - 
data_beta * temp1;
-  }
-}
-
-namespace detail
-{
-  /** @brief Implementation of inclusive_scan and exclusive_scan for the host 
(OpenMP) backend. */
-  template<typename NumericT>
-  void vector_scan_impl(vector_base<NumericT> const & vec1,
-                        vector_base<NumericT>       & vec2,
-                        bool is_inclusive)
-  {
-    NumericT const * data_vec1 = detail::extract_raw_pointer<NumericT>(vec1);
-    NumericT       * data_vec2 = detail::extract_raw_pointer<NumericT>(vec2);
-
-    vcl_size_t start1 = viennacl::traits::start(vec1);
-    vcl_size_t inc1   = viennacl::traits::stride(vec1);
-    vcl_size_t size1  = viennacl::traits::size(vec1);
-    if (size1 < 1)
-      return;
-
-    vcl_size_t start2 = viennacl::traits::start(vec2);
-    vcl_size_t inc2   = viennacl::traits::stride(vec2);
-
-#ifdef VIENNACL_WITH_OPENMP
-    if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
-    {
-      std::vector<NumericT> thread_results(omp_get_max_threads());
-
-      // inclusive scan each thread segment:
-      #pragma omp parallel
-      {
-        vcl_size_t work_per_thread = (size1 - 1) / thread_results.size() + 1;
-        vcl_size_t thread_start = work_per_thread * omp_get_thread_num();
-        vcl_size_t thread_stop  = std::min<vcl_size_t>(thread_start + 
work_per_thread, size1);
-
-        NumericT thread_sum = 0;
-        for(vcl_size_t i = thread_start; i < thread_stop; i++)
-          thread_sum += data_vec1[i * inc1 + start1];
-
-        thread_results[omp_get_thread_num()] = thread_sum;
-      }
-
-      // exclusive-scan of thread results:
-      NumericT current_offset = 0;
-      for (vcl_size_t i=0; i<thread_results.size(); ++i)
-      {
-        NumericT tmp = thread_results[i];
-        thread_results[i] = current_offset;
-        current_offset += tmp;
-      }
-
-      // exclusive/inclusive scan of each segment with correct offset:
-      #pragma omp parallel
-      {
-        vcl_size_t work_per_thread = (size1 - 1) / thread_results.size() + 1;
-        vcl_size_t thread_start = work_per_thread * omp_get_thread_num();
-        vcl_size_t thread_stop  = std::min<vcl_size_t>(thread_start + 
work_per_thread, size1);
-
-        NumericT thread_sum = thread_results[omp_get_thread_num()];
-        if (is_inclusive)
-        {
-          for(vcl_size_t i = thread_start; i < thread_stop; i++)
-          {
-            thread_sum += data_vec1[i * inc1 + start1];
-            data_vec2[i * inc2 + start2] = thread_sum;
-          }
-        }
-        else
-        {
-          for(vcl_size_t i = thread_start; i < thread_stop; i++)
-          {
-            NumericT tmp = data_vec1[i * inc1 + start1];
-            data_vec2[i * inc2 + start2] = thread_sum;
-            thread_sum += tmp;
-          }
-        }
-      }
-    } else
-#endif
-    {
-      NumericT sum = 0;
-      if (is_inclusive)
-      {
-        for(vcl_size_t i = 0; i < size1; i++)
-        {
-          sum += data_vec1[i * inc1 + start1];
-          data_vec2[i * inc2 + start2] = sum;
-        }
-      }
-      else
-      {
-        for(vcl_size_t i = 0; i < size1; i++)
-        {
-          NumericT tmp = data_vec1[i * inc1 + start1];
-          data_vec2[i * inc2 + start2] = sum;
-          sum += tmp;
-        }
-      }
-    }
-
-  }
-}
-
-/** @brief This function implements an inclusive scan on the host using OpenMP.
-*
-* Given an element vector (x_0, x_1, ..., x_{n-1}),
-* this routine computes (x_0, x_0 + x_1, ..., x_0 + x_1 + ... + x_{n-1})
-*
-* @param vec1       Input vector: Gets overwritten by the routine.
-* @param vec2       The output vector. Either identical to vec1 or 
non-overlapping.
-*/
-template<typename NumericT>
-void inclusive_scan(vector_base<NumericT> const & vec1,
-                    vector_base<NumericT>       & vec2)
-{
-  detail::vector_scan_impl(vec1, vec2, true);
-}
-
-/** @brief This function implements an exclusive scan on the host using OpenMP.
-*
-* Given an element vector (x_0, x_1, ..., x_{n-1}),
-* this routine computes (0, x_0, x_0 + x_1, ..., x_0 + x_1 + ... + x_{n-2})
-*
-* @param vec1       Input vector: Gets overwritten by the routine.
-* @param vec2       The output vector. Either identical to vec1 or 
non-overlapping.
-*/
-template<typename NumericT>
-void exclusive_scan(vector_base<NumericT> const & vec1,
-                    vector_base<NumericT>       & vec2)
-{
-  detail::vector_scan_impl(vec1, vec2, false);
-}
-
-
-} //namespace host_based
-} //namespace linalg
-} //namespace viennacl
-
-
-#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp
deleted file mode 100644
index 1038b2b..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp
+++ /dev/null
@@ -1,228 +0,0 @@
-#ifndef VIENNACL_LINALG_ICHOL_HPP_
-#define VIENNACL_LINALG_ICHOL_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file viennacl/linalg/ichol.hpp
-  @brief Implementations of incomplete Cholesky factorization preconditioners 
with static nonzero pattern.
-*/
-
-#include <vector>
-#include <cmath>
-#include <iostream>
-#include "viennacl/forwards.h"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/compressed_matrix.hpp"
-
-#include "viennacl/linalg/host_based/common.hpp"
-
-#include <map>
-
-namespace viennacl
-{
-namespace linalg
-{
-
-/** @brief A tag for incomplete Cholesky factorization with static pattern 
(ILU0)
-*/
-class ichol0_tag {};
-
-
-/** @brief Implementation of a ILU-preconditioner with static pattern. 
Optimized version for CSR matrices.
-  *
-  *  Refer to Chih-Jen Lin and Jorge J. Moré, Incomplete Cholesky 
Factorizations with Limited Memory, SIAM J. Sci. Comput., 21(1), 24–45
-  *  for one of many descriptions of incomplete Cholesky Factorizations
-  *
-  *  @param A       The input matrix in CSR format
-  *  // param tag     An ichol0_tag in order to dispatch among several other 
preconditioners.
-  */
-template<typename NumericT>
-void precondition(viennacl::compressed_matrix<NumericT> & A, ichol0_tag const 
& /* tag */)
-{
-  assert( (viennacl::traits::context(A).memory_type() == 
viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for 
ICHOL0") );
-
-  NumericT           * elements   = 
viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
-  unsigned int const * row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(A.handle1());
-  unsigned int const * col_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(A.handle2());
-
-  //std::cout << A.size1() << std::endl;
-  for (vcl_size_t i=0; i<A.size1(); ++i)
-  {
-    unsigned int row_i_begin = row_buffer[i];
-    unsigned int row_i_end   = row_buffer[i+1];
-
-    // get a_ii:
-    NumericT a_ii = 0;
-    for (unsigned int buf_index_aii = row_i_begin; buf_index_aii < row_i_end; 
++buf_index_aii)
-    {
-      if (col_buffer[buf_index_aii] == i)
-      {
-        a_ii = std::sqrt(elements[buf_index_aii]);
-        elements[buf_index_aii] = a_ii;
-        break;
-      }
-    }
-
-    // Now scale column/row i, i.e. A(k, i) /= A(i, i)
-    for (unsigned int buf_index_aii = row_i_begin; buf_index_aii < row_i_end; 
++buf_index_aii)
-    {
-      if (col_buffer[buf_index_aii] > i)
-        elements[buf_index_aii] /= a_ii;
-    }
-
-    // Now compute A(k, j) -= A(k, i) * A(j, i) for all nonzero k, j in column 
i:
-    for (unsigned int buf_index_j = row_i_begin; buf_index_j < row_i_end; 
++buf_index_j)
-    {
-      unsigned int j = col_buffer[buf_index_j];
-      if (j <= i)
-        continue;
-
-      NumericT a_ji = elements[buf_index_j];
-
-      for (unsigned int buf_index_k = row_i_begin; buf_index_k < row_i_end; 
++buf_index_k)
-      {
-        unsigned int k = col_buffer[buf_index_k];
-        if (k < j)
-          continue;
-
-        NumericT a_ki = elements[buf_index_k];
-
-        //Now check whether A(k, j) is in nonzero pattern:
-        unsigned int row_j_begin = row_buffer[j];
-        unsigned int row_j_end   = row_buffer[j+1];
-        for (unsigned int buf_index_kj = row_j_begin; buf_index_kj < 
row_j_end; ++buf_index_kj)
-        {
-          if (col_buffer[buf_index_kj] == k)
-          {
-            elements[buf_index_kj] -= a_ki * a_ji;
-            break;
-          }
-        }
-      }
-    }
-
-  }
-
-}
-
-
-/** @brief Incomplete Cholesky preconditioner class with static pattern 
(ICHOL0), can be supplied to solve()-routines
-*/
-template<typename MatrixT>
-class ichol0_precond
-{
-  typedef typename MatrixT::value_type      NumericType;
-
-public:
-  ichol0_precond(MatrixT const & mat, ichol0_tag const & tag) : tag_(tag), 
LLT(mat.size1(), mat.size2(), viennacl::context(viennacl::MAIN_MEMORY))
-  {
-      //initialize preconditioner:
-      //std::cout << "Start CPU precond" << std::endl;
-      init(mat);
-      //std::cout << "End CPU precond" << std::endl;
-  }
-
-  template<typename VectorT>
-  void apply(VectorT & vec) const
-  {
-    unsigned int const * row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(LLT.handle1());
-    unsigned int const * col_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(LLT.handle2());
-    NumericType  const * elements   = 
viennacl::linalg::host_based::detail::extract_raw_pointer<NumericType>(LLT.handle());
-
-    // Note: L is stored in a column-oriented fashion, i.e. transposed w.r.t. 
the row-oriented layout. Thus, the factorization A = L L^T holds L in the upper 
triangular part of A.
-    
viennacl::linalg::host_based::detail::csr_trans_inplace_solve<NumericType>(row_buffer,
 col_buffer, elements, vec, LLT.size2(), lower_tag());
-    
viennacl::linalg::host_based::detail::csr_inplace_solve<NumericType>(row_buffer,
 col_buffer, elements, vec, LLT.size2(), upper_tag());
-  }
-
-private:
-  void init(MatrixT const & mat)
-  {
-    viennacl::context host_ctx(viennacl::MAIN_MEMORY);
-    viennacl::switch_memory_context(LLT, host_ctx);
-
-    viennacl::copy(mat, LLT);
-    viennacl::linalg::precondition(LLT, tag_);
-  }
-
-  ichol0_tag const & tag_;
-  viennacl::compressed_matrix<NumericType> LLT;
-};
-
-
-/** @brief ILU0 preconditioner class, can be supplied to solve()-routines.
-*
-*  Specialization for compressed_matrix
-*/
-template<typename NumericT, unsigned int AlignmentV>
-class ichol0_precond< compressed_matrix<NumericT, AlignmentV> >
-{
-  typedef compressed_matrix<NumericT, AlignmentV>   MatrixType;
-
-public:
-  ichol0_precond(MatrixType const & mat, ichol0_tag const & tag) : tag_(tag), 
LLT(mat.size1(), mat.size2(), viennacl::traits::context(mat))
-  {
-    //initialize preconditioner:
-    //std::cout << "Start GPU precond" << std::endl;
-    init(mat);
-    //std::cout << "End GPU precond" << std::endl;
-  }
-
-  void apply(vector<NumericT> & vec) const
-  {
-    if (viennacl::traits::context(vec).memory_type() != viennacl::MAIN_MEMORY)
-    {
-      viennacl::context host_ctx(viennacl::MAIN_MEMORY);
-      viennacl::context old_ctx = viennacl::traits::context(vec);
-
-      viennacl::switch_memory_context(vec, host_ctx);
-      viennacl::linalg::inplace_solve(trans(LLT), vec, lower_tag());
-      viennacl::linalg::inplace_solve(      LLT , vec, upper_tag());
-      viennacl::switch_memory_context(vec, old_ctx);
-    }
-    else //apply ILU0 directly:
-    {
-      // Note: L is stored in a column-oriented fashion, i.e. transposed 
w.r.t. the row-oriented layout. Thus, the factorization A = L L^T holds L in 
the upper triangular part of A.
-      viennacl::linalg::inplace_solve(trans(LLT), vec, lower_tag());
-      viennacl::linalg::inplace_solve(      LLT , vec, upper_tag());
-    }
-  }
-
-private:
-  void init(MatrixType const & mat)
-  {
-    viennacl::context host_ctx(viennacl::MAIN_MEMORY);
-    viennacl::switch_memory_context(LLT, host_ctx);
-    LLT = mat;
-
-    viennacl::linalg::precondition(LLT, tag_);
-  }
-
-  ichol0_tag const & tag_;
-  viennacl::compressed_matrix<NumericT> LLT;
-};
-
-}
-}
-
-
-
-
-#endif
-
-
-

Reply via email to