http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp deleted file mode 100644 index 56e3c14..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp +++ /dev/null @@ -1,705 +0,0 @@ -#ifndef VIENNACL_LINALG_HOST_BASED_SPGEMM_VECTOR_HPP_ -#define VIENNACL_LINALG_HOST_BASED_SPGEMM_VECTOR_HPP_ - -/* ========================================================================= - Copyright (c) 2010-2016, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. - Portions of this software are copyright by UChicago Argonne, LLC. - - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - -/** @file viennacl/linalg/host_based/sparse_matrix_operations.hpp - @brief Implementations of operations using sparse matrices on the CPU using a single thread or OpenMP. 
*/

#include "viennacl/forwards.h"
#include "viennacl/linalg/host_based/common.hpp"


#ifdef VIENNACL_WITH_AVX2
#include "immintrin.h"
#endif


namespace viennacl
{
namespace linalg
{
namespace host_based
{



#ifdef VIENNACL_WITH_AVX2
/** @brief AVX2-accelerated symbolic pass: merges the column indices of up to eight rows of B into one sorted, duplicate-free index list.
*
* Maintains an 8-wide "index front" (one current column index per merged row); each iteration extracts the minimum of the front,
* emits it once, and advances every lane whose current index equals that minimum. Exhausted lanes are parked at B_size2,
* which therefore acts as the termination sentinel.
*
* @param row_indices_B_begin  Start of the (at most 8) row indices of B to merge
* @param row_indices_B_end    End of that range; fewer than 8 rows are handled via the lane mask
* @param B_row_buffer         CSR row pointer array of B
* @param B_col_buffer         CSR column index array of B
* @param B_size2              Number of columns of B (used as sentinel)
* @param row_C_vector_output  Output buffer receiving the merged, sorted column indices
* @return Number of indices written (i.e. the nonzero count of the merged pattern)
*/
inline
unsigned int row_C_scan_symbolic_vector_AVX2(int const *row_indices_B_begin, int const *row_indices_B_end,
                                             int const *B_row_buffer, int const *B_col_buffer, int B_size2,
                                             int *row_C_vector_output)
{
  __m256i avx_all_ones    = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
  __m256i avx_all_bsize2  = _mm256_set_epi32(B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2);

  // Lane i should load row_indices_B_begin[i]; lanes with offset >= (end - begin) get a non-negative
  // difference here, i.e. a cleared sign bit, which deactivates them in the masked gathers below.
  __m256i avx_row_indices_offsets = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  __m256i avx_load_mask           = _mm256_sub_epi32(avx_row_indices_offsets, _mm256_set1_epi32(row_indices_B_end - row_indices_B_begin));
  __m256i avx_load_mask2          = avx_load_mask;

  __m256i avx_row_indices = _mm256_set1_epi32(0);
  avx_row_indices = _mm256_mask_i32gather_epi32(avx_row_indices, row_indices_B_begin, avx_row_indices_offsets, avx_load_mask, 4);
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256i avx_row_start = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer,   avx_row_indices, avx_load_mask, 4);
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256i avx_row_end   = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer+1, avx_row_indices, avx_load_mask, 4);

  // Initialize the index front; lanes with an empty row stay at the B_size2 sentinel.
  avx_load_mask = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
  __m256i avx_index_front = avx_all_bsize2;
  avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);

  int *output_ptr = row_C_vector_output;

  while (1)
  {
    // get minimum index in current front (log2(8)=3 min/shuffle stages broadcast the minimum to all lanes):
    __m256i avx_index_min1 = avx_index_front;
    __m256i avx_temp = _mm256_permutevar8x32_epi32(avx_index_min1, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4));
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first four elements compared against last four elements

    avx_temp = _mm256_shuffle_epi32(avx_index_min1, int(78)); // 0b01001110 = 78, using shuffle instead of permutevar here because of lower latency
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first two elements compared against elements three and four (same for upper half of register)

    avx_temp = _mm256_shuffle_epi32(avx_index_min1, int(177)); // 0b10110001 = 177, using shuffle instead of permutevar here because of lower latency
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // now all entries of avx_index_min1 hold the minimum

    int min_index_in_front = ((int*)&avx_index_min1)[0];
    // check for end of merge operation:
    if (min_index_in_front == B_size2)
      break;

    // write current entry:
    *output_ptr = min_index_in_front;
    ++output_ptr;

    // advance index front where equal to minimum index:
    avx_load_mask = _mm256_cmpeq_epi32(avx_index_front, avx_index_min1);
    // first part: set index to B_size2 if equal to minimum index:
    avx_temp = _mm256_and_si256(avx_all_bsize2, avx_load_mask);
    avx_index_front = _mm256_max_epi32(avx_index_front, avx_temp);
    // second part: increment row_start registers where minimum found:
    avx_temp = _mm256_and_si256(avx_all_ones, avx_load_mask); //ones only where the minimum was found
    avx_row_start = _mm256_add_epi32(avx_row_start, avx_temp);
    // third part: load new data where more entries available:
    avx_load_mask = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
    avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);
  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}
#endif

/** @brief Merges up to IndexNum rows from B into the result buffer.
*
* Because the input buffer also needs to be considered, this routine actually works on an index front of length (IndexNum+1)
**/
template<unsigned int IndexNum>
unsigned int row_C_scan_symbolic_vector_N(unsigned int const *row_indices_B,
                                          unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, unsigned int B_size2,
                                          unsigned int const *row_C_vector_input, unsigned int const *row_C_vector_input_end,
                                          unsigned int *row_C_vector_output)
{
  // Index front: slot i (< IndexNum) tracks row row_indices_B[i] of B; slot IndexNum tracks the
  // partially merged input buffer. Exhausted slots are parked at the sentinel value B_size2.
  unsigned int index_front[IndexNum+1];
  unsigned int const *index_front_start[IndexNum+1];
  unsigned int const *index_front_end[IndexNum+1];

  // Set up pointers for loading the indices:
  for (unsigned int i=0; i<IndexNum; ++i, ++row_indices_B)
  {
    index_front_start[i] = B_col_buffer + B_row_buffer[*row_indices_B];
    index_front_end[i]   = B_col_buffer + B_row_buffer[*row_indices_B + 1];
  }
  index_front_start[IndexNum] = row_C_vector_input;
  index_front_end[IndexNum]   = row_C_vector_input_end;

  // load indices:
  for (unsigned int i=0; i<=IndexNum; ++i)
    index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;

  unsigned int *output_ptr = row_C_vector_output;

  while (1)
  {
    // get minimum index in current front:
    unsigned int min_index_in_front = B_size2;
    for (unsigned int i=0; i<=IndexNum; ++i)
      min_index_in_front = std::min(min_index_in_front, index_front[i]);

    if (min_index_in_front == B_size2) // we're done
      break;

    // advance index front where equal to minimum index:
    for (unsigned int i=0; i<=IndexNum; ++i)
    {
      if (index_front[i] == min_index_in_front)
      {
        index_front_start[i] += 1;
        index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;
      }
    }

    // write current entry (duplicates collapse to one output entry):
    *output_ptr = min_index_in_front;
    ++output_ptr;
  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}

/** @brief Output policies for the two-way merge: the symbolic pass sometimes only needs the merged *length*, in which case the index write can be skipped. */
struct spgemm_output_write_enabled  { static void apply(unsigned int *ptr, unsigned int value) { *ptr = value; } };
struct spgemm_output_write_disabled { static void apply(unsigned int * , unsigned int ) { } };

/** @brief Two-way sorted merge of column-index ranges with duplicate elimination.
*
* Merges [input1_begin, input1_end) and [input2_begin, input2_end) — both sorted ascending — into output_begin.
* termination_index (B_size2 at the call sites) is the past-the-end sentinel and must exceed every valid index.
* OutputWriterT controls whether the merged indices are actually stored (see the policy structs above).
*
* @return Number of distinct indices in the merged result
*/
template<typename OutputWriterT>
unsigned int row_C_scan_symbolic_vector_1(unsigned int const *input1_begin, unsigned int const *input1_end,
                                          unsigned int const *input2_begin, unsigned int const *input2_end,
                                          unsigned int termination_index,
                                          unsigned int *output_begin)
{
  unsigned int *output_ptr = output_begin;

  unsigned int val_1 = (input1_begin < input1_end) ? *input1_begin : termination_index;
  unsigned int val_2 = (input2_begin < input2_end) ? *input2_begin : termination_index;
  while (1)
  {
    unsigned int min_index = std::min(val_1, val_2);

    if (min_index == termination_index)
      break;

    if (min_index == val_1)
    {
      ++input1_begin;
      val_1 = (input1_begin < input1_end) ? *input1_begin : termination_index;
    }

    if (min_index == val_2)
    {
      ++input2_begin;
      val_2 = (input2_begin < input2_end) ? *input2_begin : termination_index;
    }

    // write current entry:
    OutputWriterT::apply(output_ptr, min_index); // *output_ptr = min_index; if necessary
    ++output_ptr;
  }

  return static_cast<unsigned int>(output_ptr - output_begin);
}

/** @brief Symbolic pass for one row of C = A * B: computes the number of nonzeros in that row.
*
* Repeatedly merges rows of B (selected by the column indices of row row_start_A..row_end_A of A) into the
* scratch buffers, ping-ponging between row_C_vector_1 and row_C_vector_2; row_C_vector_3 holds intermediate
* two-row (or, with AVX2, eight-row) merges. The final merge uses the write-disabled policy since only the
* length is needed.
*
* @return Number of nonzeros in the processed row of C
*/
inline
unsigned int row_C_scan_symbolic_vector(unsigned int row_start_A, unsigned int row_end_A, unsigned int const *A_col_buffer,
                                        unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, unsigned int B_size2,
                                        unsigned int *row_C_vector_1, unsigned int *row_C_vector_2, unsigned int *row_C_vector_3)
{
  // Trivial case: row length 0:
  if (row_start_A == row_end_A)
    return 0;

  // Trivial case: row length 1: result pattern equals the single row of B, so only its length is needed.
  if (row_end_A - row_start_A == 1)
  {
    unsigned int A_col = A_col_buffer[row_start_A];
    return B_row_buffer[A_col + 1] - B_row_buffer[A_col];
  }

  // Optimizations for row length 2:
  unsigned int row_C_len = 0;
  if (row_end_A - row_start_A == 2)
  {
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
    return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
                                                                      B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
                                                                      B_size2,
                                                                      row_C_vector_1);
  }
  else // for more than two rows we can safely merge the first two:
  {
#ifdef VIENNACL_WITH_AVX2
    row_C_len = row_C_scan_symbolic_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A),
                                                (const int*)B_row_buffer, (const int*)B_col_buffer, int(B_size2),
                                                (int*)row_C_vector_1);
    row_start_A += 8; // AVX2 kernel consumed up to eight rows
#else
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
    row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
                                                                          B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
                                                                          B_size2,
                                                                          row_C_vector_1);
    row_start_A += 2;
#endif
  }

  // all other row lengths:
  while (row_end_A > row_start_A)
  {
#ifdef VIENNACL_WITH_AVX2
    if (row_end_A - row_start_A > 2) // we deal with one or two remaining rows more efficiently below:
    {
      unsigned int merged_len = row_C_scan_symbolic_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A),
                                                                (const int*)B_row_buffer, (const int*)B_col_buffer, int(B_size2),
                                                                (int*)row_C_vector_3);
      if (row_start_A + 8 >= row_end_A) // final merge: only the length is needed
        row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                               row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                               B_size2,
                                                                               row_C_vector_2);
      else
        row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                              row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                              B_size2,
                                                                              row_C_vector_2);
      row_start_A += 8;
    }
    else
#endif
    if (row_start_A == row_end_A - 1) // last merge operation. No need to write output
    {
      // process last row
      unsigned int row_index_B = A_col_buffer[row_start_A];
      return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(B_col_buffer + B_row_buffer[row_index_B], B_col_buffer + B_row_buffer[row_index_B + 1],
                                                                        row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                        B_size2,
                                                                        row_C_vector_2);
    }
    else if (row_start_A + 1 < row_end_A)// at least two more rows left, so merge them
    {
      // process single row:
      unsigned int A_col_1 = A_col_buffer[row_start_A];
      unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
      unsigned int merged_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
                                                                                          B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
                                                                                          B_size2,
                                                                                          row_C_vector_3);
      if (row_start_A + 2 == row_end_A) // last merge does not need a write:
        return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                          row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                          B_size2,
                                                                          row_C_vector_2);
      else
        row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                              row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                              B_size2,
                                                                              row_C_vector_2);
      row_start_A += 2;
    }
    else // fallback: merge a single remaining row into the intermediate buffer
    {
      // process single row:
      unsigned int row_index_B = A_col_buffer[row_start_A];
      row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[row_index_B], B_col_buffer + B_row_buffer[row_index_B + 1],
                                                                            row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                            B_size2,
                                                                            row_C_vector_2);
      ++row_start_A;
    }

    std::swap(row_C_vector_1, row_C_vector_2); // ping-pong: merged result becomes next iteration's input
  }

  return row_C_len;
}

//////////////////////////////

/** @brief Merges up to IndexNum rows from B into the result buffer.
*
* Because the input buffer also needs to be considered, this routine actually works on an index front of length (IndexNum+1)
**/
template<unsigned int IndexNum, typename NumericT>
unsigned int row_C_scan_numeric_vector_N(unsigned int const *row_indices_B, NumericT const *val_A,
                                         unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, NumericT const *B_elements, unsigned int B_size2,
                                         unsigned int const *row_C_vector_input, unsigned int const *row_C_vector_input_end, NumericT *row_C_vector_input_values,
                                         unsigned int *row_C_vector_output, NumericT *row_C_vector_output_values)
{
  // Numeric analogue of row_C_scan_symbolic_vector_N: in addition to the index front it carries a
  // value front, accumulating values_A[i] * B_value for every lane that contributes the minimum index.
  unsigned int index_front[IndexNum+1];
  unsigned int const *index_front_start[IndexNum+1];
  unsigned int const *index_front_end[IndexNum+1];
  NumericT const * value_front_start[IndexNum+1];
  NumericT values_A[IndexNum+1];

  // Set up pointers for loading the indices:
  for (unsigned int i=0; i<IndexNum; ++i, ++row_indices_B)
  {
    unsigned int row_B = *row_indices_B;

    index_front_start[i] = B_col_buffer + B_row_buffer[row_B];
    index_front_end[i]   = B_col_buffer + B_row_buffer[row_B + 1];
    value_front_start[i] = B_elements + B_row_buffer[row_B];
    values_A[i] = val_A[i];
  }
  // Slot IndexNum is the already-merged input buffer; its entries are taken with unit weight.
  index_front_start[IndexNum] = row_C_vector_input;
  index_front_end[IndexNum]   = row_C_vector_input_end;
  value_front_start[IndexNum] = row_C_vector_input_values;
  values_A[IndexNum] = NumericT(1);

  // load indices:
  for (unsigned int i=0; i<=IndexNum; ++i)
    index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;

  unsigned int *output_ptr = row_C_vector_output;

  while (1)
  {
    // get minimum index in current front:
    unsigned int min_index_in_front = B_size2;
    for (unsigned int i=0; i<=IndexNum; ++i)
      min_index_in_front = std::min(min_index_in_front, index_front[i]);

    if (min_index_in_front == B_size2) // we're done
      break;

    // advance index front where equal to minimum index:
    NumericT row_C_value = 0;
    for (unsigned int i=0; i<=IndexNum; ++i)
    {
      if (index_front[i] == min_index_in_front)
      {
        index_front_start[i] += 1;
        index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;

        row_C_value += values_A[i] * *value_front_start[i];
        value_front_start[i] += 1;
      }
    }

    // write current entry:
    *output_ptr = min_index_in_front;
    ++output_ptr;
    *row_C_vector_output_values = row_C_value;
    ++row_C_vector_output_values;
  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}



#ifdef VIENNACL_WITH_AVX2
/** @brief AVX2-accelerated numeric pass: merges up to eight rows of B, weighted by the respective entries of A, into an index/value output pair.
*
* Same front-based merge as row_C_scan_symbolic_vector_AVX2, but additionally gathers the double-precision
* values of B (split into a low and a high __m256d half, since eight doubles do not fit one register) and
* the weights from A. NOTE(review): this kernel is double-only; the callers cast NumericT pointers to
* double const* — confirm it is only instantiated for double.
*
* @return Number of entries written to row_C_vector_output / row_C_vector_output_values
*/
inline
unsigned int row_C_scan_numeric_vector_AVX2(int const *row_indices_B_begin, int const *row_indices_B_end, double const *values_A,
                                            int const *B_row_buffer, int const *B_col_buffer, double const *B_elements,
                                            int B_size2,
                                            int *row_C_vector_output, double *row_C_vector_output_values)
{
  __m256i avx_all_ones    = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
  __m256i avx_all_bsize2  = _mm256_set_epi32(B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2);

  // Negative (sign-bit set) lanes in the mask select active gather lanes; see symbolic AVX2 kernel.
  __m256i avx_row_indices_offsets = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  __m256i avx_load_mask           = _mm256_sub_epi32(avx_row_indices_offsets, _mm256_set1_epi32(row_indices_B_end - row_indices_B_begin));
  __m256i avx_load_mask2          = avx_load_mask;

  __m256i avx_row_indices = _mm256_set1_epi32(0);
  avx_row_indices = _mm256_mask_i32gather_epi32(avx_row_indices, row_indices_B_begin, avx_row_indices_offsets, avx_load_mask, 4);

  // load values from A (the permutevar expands the 32-bit lane mask to the 64-bit mask layout required by the pd-gather):
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256d avx_value_A_low = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
                                                     values_A, //base ptr
                                                     _mm256_extractf128_si256(avx_row_indices_offsets, 0), //indices
                                                     _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)), 8); // mask
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256d avx_value_A_high = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
                                                      values_A, //base ptr
                                                      _mm256_extractf128_si256(avx_row_indices_offsets, 1), //indices
                                                      _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)), 8); // mask


  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256i avx_row_start = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer,   avx_row_indices, avx_load_mask, 4);
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256i avx_row_end   = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer+1, avx_row_indices, avx_load_mask, 4);

  avx_load_mask  = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
  avx_load_mask2 = avx_load_mask;
  __m256i avx_index_front = avx_all_bsize2;
  avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);

  // load front values from B:
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256d avx_value_front_low = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
                                                         B_elements, //base ptr
                                                         _mm256_extractf128_si256(avx_row_start, 0), //indices
                                                         _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)), 8); // mask
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256d avx_value_front_high = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
                                                          B_elements, //base ptr
                                                          _mm256_extractf128_si256(avx_row_start, 1), //indices
                                                          _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)), 8); // mask

  int *output_ptr = row_C_vector_output;

  while (1)
  {
    // get minimum index in current front:
    __m256i avx_index_min1 = avx_index_front;
    __m256i avx_temp = _mm256_permutevar8x32_epi32(avx_index_min1, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4));
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first four elements compared against last four elements

    avx_temp = _mm256_shuffle_epi32(avx_index_min1, int(78)); // 0b01001110 = 78, using shuffle instead of permutevar here because of lower latency
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first two elements compared against elements three and four (same for upper half of register)

    avx_temp = _mm256_shuffle_epi32(avx_index_min1, int(177)); // 0b10110001 = 177, using shuffle instead of permutevar here because of lower latency
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // now all entries of avx_index_min1 hold the minimum

    int min_index_in_front = ((int*)&avx_index_min1)[0];
    // check for end of merge operation:
    if (min_index_in_front == B_size2)
      break;

    // accumulate value (can certainly be done more elegantly...)
    double value = 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[0]) ? ((double*)&avx_value_front_low)[0]  * ((double*)&avx_value_A_low)[0]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[1]) ? ((double*)&avx_value_front_low)[1]  * ((double*)&avx_value_A_low)[1]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[2]) ? ((double*)&avx_value_front_low)[2]  * ((double*)&avx_value_A_low)[2]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[3]) ? ((double*)&avx_value_front_low)[3]  * ((double*)&avx_value_A_low)[3]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[4]) ? ((double*)&avx_value_front_high)[0] * ((double*)&avx_value_A_high)[0] : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[5]) ? ((double*)&avx_value_front_high)[1] * ((double*)&avx_value_A_high)[1] : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[6]) ? ((double*)&avx_value_front_high)[2] * ((double*)&avx_value_A_high)[2] : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[7]) ? ((double*)&avx_value_front_high)[3] * ((double*)&avx_value_A_high)[3] : 0;
    *row_C_vector_output_values = value;
    ++row_C_vector_output_values;

    // write current entry:
    *output_ptr = min_index_in_front;
    ++output_ptr;

    // advance index front where equal to minimum index:
    avx_load_mask = _mm256_cmpeq_epi32(avx_index_front, avx_index_min1);
    // first part: set index to B_size2 if equal to minimum index:
    avx_temp = _mm256_and_si256(avx_all_bsize2, avx_load_mask);
    avx_index_front = _mm256_max_epi32(avx_index_front, avx_temp);
    // second part: increment row_start registers where minimum found:
    avx_temp = _mm256_and_si256(avx_all_ones, avx_load_mask); //ones only where the minimum was found
    avx_row_start = _mm256_add_epi32(avx_row_start, avx_temp);
    // third part: load new data where more entries available:
    avx_load_mask = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
    avx_load_mask2 = avx_load_mask;
    avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);

    // load new values where necessary:
    avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
    avx_value_front_low = _mm256_mask_i32gather_pd(avx_value_front_low, //src
                                                   B_elements, //base ptr
                                                   _mm256_extractf128_si256(avx_row_start, 0), //indices
                                                   _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)), 8); // mask

    avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
    avx_value_front_high = _mm256_mask_i32gather_pd(avx_value_front_high, //src
                                                    B_elements, //base ptr
                                                    _mm256_extractf128_si256(avx_row_start, 1), //indices
                                                    _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)), 8); // mask

    //multiply new entries:

  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}
#endif


/** @brief Two-way sorted merge with value accumulation: entries of input1 are weighted by factor1, entries of input2 by factor2; duplicate indices sum their contributions.
*
* Both index ranges must be sorted ascending; termination_index (B_size2 at the call sites) is the sentinel.
*
* @return Number of distinct indices written to output_index_begin / output_values_begin
*/
template<typename NumericT>
unsigned int row_C_scan_numeric_vector_1(unsigned int const *input1_index_begin, unsigned int const *input1_index_end, NumericT const *input1_values_begin, NumericT factor1,
                                         unsigned int const *input2_index_begin, unsigned int const *input2_index_end, NumericT const *input2_values_begin, NumericT factor2,
                                         unsigned int termination_index,
                                         unsigned int *output_index_begin, NumericT *output_values_begin)
{
  unsigned int *output_ptr = output_index_begin;

  unsigned int index1 = (input1_index_begin < input1_index_end) ? *input1_index_begin : termination_index;
  unsigned int index2 = (input2_index_begin < input2_index_end) ? *input2_index_begin : termination_index;

  while (1)
  {
    unsigned int min_index = std::min(index1, index2);
    NumericT value = 0;

    if (min_index == termination_index)
      break;

    if (min_index == index1)
    {
      ++input1_index_begin;
      index1 = (input1_index_begin < input1_index_end) ? *input1_index_begin : termination_index;

      value += factor1 * *input1_values_begin;
      ++input1_values_begin;
    }

    if (min_index == index2)
    {
      ++input2_index_begin;
      index2 = (input2_index_begin < input2_index_end) ? *input2_index_begin : termination_index;

      value += factor2 * *input2_values_begin;
      ++input2_values_begin;
    }

    // write current entry:
    *output_ptr = min_index;
    ++output_ptr;
    *output_values_begin = value;
    ++output_values_begin;
  }

  return static_cast<unsigned int>(output_ptr - output_index_begin);
}

/** @brief Numeric pass for one row of C = A * B: fills C_col_buffer/C_elements for the index range [row_start_C, row_end_C) computed by the symbolic pass.
*
* Mirrors the merge schedule of row_C_scan_symbolic_vector (same ping-pong over the three scratch buffers),
* so that the final merge can be written directly into C at offset row_start_C.
* row_end_C is accepted for interface symmetry but unused here.
*/
template<typename NumericT>
void row_C_scan_numeric_vector(unsigned int row_start_A, unsigned int row_end_A, unsigned int const *A_col_buffer, NumericT const *A_elements,
                               unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, NumericT const *B_elements, unsigned int B_size2,
                               unsigned int row_start_C, unsigned int row_end_C, unsigned int *C_col_buffer, NumericT *C_elements,
                               unsigned int *row_C_vector_1, NumericT *row_C_vector_1_values,
                               unsigned int *row_C_vector_2, NumericT *row_C_vector_2_values,
                               unsigned int *row_C_vector_3, NumericT *row_C_vector_3_values)
{
  (void)row_end_C;

  // Trivial case: row length 0:
  if (row_start_A == row_end_A)
    return;

  // Trivial case: row length 1: scale the single row of B and copy it to C directly.
  if (row_end_A - row_start_A == 1)
  {
    unsigned int A_col = A_col_buffer[row_start_A];
    unsigned int B_end = B_row_buffer[A_col + 1];
    NumericT A_value = A_elements[row_start_A];
    C_col_buffer += row_start_C;
    C_elements += row_start_C;
    for (unsigned int j = B_row_buffer[A_col]; j < B_end; ++j, ++C_col_buffer, ++C_elements)
    {
      *C_col_buffer = B_col_buffer[j];
      *C_elements = A_value * B_elements[j];
    }
    return;
  }

  unsigned int row_C_len = 0;
  if (row_end_A - row_start_A == 2) // directly merge to C:
  {
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];

    unsigned int B_offset_1 = B_row_buffer[A_col_1];
    unsigned int B_offset_2 = B_row_buffer[A_col_2];

    row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
                                B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
                                B_size2,
                                C_col_buffer + row_start_C, C_elements + row_start_C);
    return;
  }
#ifdef VIENNACL_WITH_AVX2
  else if (row_end_A - row_start_A > 10) // safely merge eight rows into temporary buffer:
  {
    row_C_len = row_C_scan_numeric_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A), A_elements + row_start_A,
                                               (const int*)B_row_buffer, (const int*)B_col_buffer, B_elements, int(B_size2),
                                               (int*)row_C_vector_1, row_C_vector_1_values);
    row_start_A += 8;
  }
#endif
  else // safely merge two rows into temporary buffer:
  {
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];

    unsigned int B_offset_1 = B_row_buffer[A_col_1];
    unsigned int B_offset_2 = B_row_buffer[A_col_2];

    row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
                                            B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
                                            B_size2,
                                            row_C_vector_1, row_C_vector_1_values);
    row_start_A += 2;
  }

  // process remaining rows:
  while (row_end_A > row_start_A)
  {
#ifdef VIENNACL_WITH_AVX2
    if (row_end_A - row_start_A > 9) // code in other if-conditionals ensures that values get written to C
    {
      unsigned int merged_len = row_C_scan_numeric_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A), A_elements + row_start_A,
                                                               (const int*)B_row_buffer, (const int*)B_col_buffer, B_elements, int(B_size2),
                                                               (int*)row_C_vector_3, row_C_vector_3_values);
      row_C_len = row_C_scan_numeric_vector_1(row_C_vector_3, row_C_vector_3 + merged_len, row_C_vector_3_values, NumericT(1.0),
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              row_C_vector_2, row_C_vector_2_values);
      row_start_A += 8;
    }
    else
#endif
    if (row_start_A + 1 == row_end_A) // last row to merge, write directly to C:
    {
      unsigned int A_col = A_col_buffer[row_start_A];
      unsigned int B_offset = B_row_buffer[A_col];

      row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset, B_col_buffer + B_row_buffer[A_col+1], B_elements + B_offset, A_elements[row_start_A],
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              C_col_buffer + row_start_C, C_elements + row_start_C);
      return;
    }
    else if (row_start_A + 2 < row_end_A)// at least three more rows left, so merge two
    {
      // process single row:
      unsigned int A_col_1 = A_col_buffer[row_start_A];
      unsigned int A_col_2 = A_col_buffer[row_start_A + 1];

      unsigned int B_offset_1 = B_row_buffer[A_col_1];
      unsigned int B_offset_2 = B_row_buffer[A_col_2];

      unsigned int merged_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
                                                            B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
                                                            B_size2,
                                                            row_C_vector_3, row_C_vector_3_values);
      row_C_len = row_C_scan_numeric_vector_1(row_C_vector_3, row_C_vector_3 + merged_len, row_C_vector_3_values, NumericT(1.0),
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              row_C_vector_2, row_C_vector_2_values);
      row_start_A += 2;
    }
    else // exactly two rows left: merge one now, the final one is handled (and written to C) next iteration
    {
      unsigned int A_col = A_col_buffer[row_start_A];
      unsigned int B_offset = B_row_buffer[A_col];

      row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset, B_col_buffer + B_row_buffer[A_col+1], B_elements + B_offset, A_elements[row_start_A],
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              row_C_vector_2, row_C_vector_2_values);
      ++row_start_A;
    }

    std::swap(row_C_vector_1, row_C_vector_2); // ping-pong: merged result becomes next iteration's input
    std::swap(row_C_vector_1_values, row_C_vector_2_values);
  }
}


} // namespace host_based
} //namespace linalg
} //namespace viennacl


#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp deleted file mode 100644 index b4944a2..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp +++ /dev/null @@ -1,1188 +0,0 @@ -#ifndef VIENNACL_LINALG_HOST_BASED_VECTOR_OPERATIONS_HPP_ -#define VIENNACL_LINALG_HOST_BASED_VECTOR_OPERATIONS_HPP_ - -/* ========================================================================= - Copyright (c) 2010-2016, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. - Portions of this software are copyright by UChicago Argonne, LLC. - - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - -/** @file viennacl/linalg/host_based/vector_operations.hpp - @brief Implementations of vector operations using a plain single-threaded or OpenMP-enabled execution on CPU -*/ - -#include <cmath> -#include <algorithm> //for std::max and std::min - -#include "viennacl/forwards.h" -#include "viennacl/scalar.hpp" -#include "viennacl/tools/tools.hpp" -#include "viennacl/meta/predicate.hpp" -#include "viennacl/meta/enable_if.hpp" -#include "viennacl/traits/size.hpp" -#include "viennacl/traits/start.hpp" -#include "viennacl/linalg/host_based/common.hpp" -#include "viennacl/linalg/detail/op_applier.hpp" -#include "viennacl/traits/stride.hpp" - -#ifdef VIENNACL_WITH_OPENMP -#include <omp.h> -#endif - -// Minimum vector 
size for using OpenMP on vector operations: -#ifndef VIENNACL_OPENMP_VECTOR_MIN_SIZE - #define VIENNACL_OPENMP_VECTOR_MIN_SIZE 5000 -#endif - -namespace viennacl -{ -namespace linalg -{ -namespace host_based -{ -namespace detail -{ - template<typename NumericT> - NumericT flip_sign(NumericT val) { return -val; } - inline unsigned long flip_sign(unsigned long val) { return val; } - inline unsigned int flip_sign(unsigned int val) { return val; } - inline unsigned short flip_sign(unsigned short val) { return val; } - inline unsigned char flip_sign(unsigned char val) { return val; } -} - -// -// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here! -// -template<typename DestNumericT, typename SrcNumericT> -void convert(vector_base<DestNumericT> & dest, vector_base<SrcNumericT> const & src) -{ - DestNumericT * data_dest = detail::extract_raw_pointer<DestNumericT>(dest); - SrcNumericT const * data_src = detail::extract_raw_pointer<SrcNumericT>(src); - - vcl_size_t start_dest = viennacl::traits::start(dest); - vcl_size_t inc_dest = viennacl::traits::stride(dest); - vcl_size_t size_dest = viennacl::traits::size(dest); - - vcl_size_t start_src = viennacl::traits::start(src); - vcl_size_t inc_src = viennacl::traits::stride(src); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size_dest > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i = 0; i < static_cast<long>(size_dest); ++i) - data_dest[static_cast<vcl_size_t>(i)*inc_dest+start_dest] = static_cast<DestNumericT>(data_src[static_cast<vcl_size_t>(i)*inc_src+start_src]); -} - -template<typename NumericT, typename ScalarT1> -void av(vector_base<NumericT> & vec1, - vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha) -{ - typedef NumericT value_type; - - value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); - value_type const * 
data_vec2 = detail::extract_raw_pointer<value_type>(vec2); - - value_type data_alpha = alpha; - if (flip_sign_alpha) - data_alpha = detail::flip_sign(data_alpha); - - vcl_size_t start1 = viennacl::traits::start(vec1); - vcl_size_t inc1 = viennacl::traits::stride(vec1); - vcl_size_t size1 = viennacl::traits::size(vec1); - - vcl_size_t start2 = viennacl::traits::start(vec2); - vcl_size_t inc2 = viennacl::traits::stride(vec2); - - if (reciprocal_alpha) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i = 0; i < static_cast<long>(size1); ++i) - data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha; - } - else - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i = 0; i < static_cast<long>(size1); ++i) - data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha; - } -} - - -template<typename NumericT, typename ScalarT1, typename ScalarT2> -void avbv(vector_base<NumericT> & vec1, - vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t /* len_alpha */, bool reciprocal_alpha, bool flip_sign_alpha, - vector_base<NumericT> const & vec3, ScalarT2 const & beta, vcl_size_t /* len_beta */, bool reciprocal_beta, bool flip_sign_beta) -{ - typedef NumericT value_type; - - value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); - value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2); - value_type const * data_vec3 = detail::extract_raw_pointer<value_type>(vec3); - - value_type data_alpha = alpha; - if (flip_sign_alpha) - data_alpha = detail::flip_sign(data_alpha); - - value_type data_beta = beta; - if (flip_sign_beta) - data_beta = detail::flip_sign(data_beta); - - vcl_size_t start1 = viennacl::traits::start(vec1); - vcl_size_t inc1 = viennacl::traits::stride(vec1); 
- vcl_size_t size1 = viennacl::traits::size(vec1); - - vcl_size_t start2 = viennacl::traits::start(vec2); - vcl_size_t inc2 = viennacl::traits::stride(vec2); - - vcl_size_t start3 = viennacl::traits::start(vec3); - vcl_size_t inc3 = viennacl::traits::stride(vec3); - - if (reciprocal_alpha) - { - if (reciprocal_beta) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i = 0; i < static_cast<long>(size1); ++i) - data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta; - } - else - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i = 0; i < static_cast<long>(size1); ++i) - data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta; - } - } - else - { - if (reciprocal_beta) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i = 0; i < static_cast<long>(size1); ++i) - data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta; - } - else - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i = 0; i < static_cast<long>(size1); ++i) - data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta; - } - } -} - - -template<typename NumericT, typename ScalarT1, typename ScalarT2> -void avbv_v(vector_base<NumericT> & vec1, - vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool 
reciprocal_alpha, bool flip_sign_alpha, - vector_base<NumericT> const & vec3, ScalarT2 const & beta, vcl_size_t /*len_beta*/, bool reciprocal_beta, bool flip_sign_beta) -{ - typedef NumericT value_type; - - value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); - value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2); - value_type const * data_vec3 = detail::extract_raw_pointer<value_type>(vec3); - - value_type data_alpha = alpha; - if (flip_sign_alpha) - data_alpha = detail::flip_sign(data_alpha); - - value_type data_beta = beta; - if (flip_sign_beta) - data_beta = detail::flip_sign(data_beta); - - vcl_size_t start1 = viennacl::traits::start(vec1); - vcl_size_t inc1 = viennacl::traits::stride(vec1); - vcl_size_t size1 = viennacl::traits::size(vec1); - - vcl_size_t start2 = viennacl::traits::start(vec2); - vcl_size_t inc2 = viennacl::traits::stride(vec2); - - vcl_size_t start3 = viennacl::traits::start(vec3); - vcl_size_t inc3 = viennacl::traits::stride(vec3); - - if (reciprocal_alpha) - { - if (reciprocal_beta) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i = 0; i < static_cast<long>(size1); ++i) - data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta; - } - else - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i = 0; i < static_cast<long>(size1); ++i) - data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta; - } - } - else - { - if (reciprocal_beta) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i = 0; i < static_cast<long>(size1); ++i) - 
data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta; - } - else - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i = 0; i < static_cast<long>(size1); ++i) - data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta; - } - } -} - - - - -/** @brief Assign a constant value to a vector (-range/-slice) -* -* @param vec1 The vector to which the value should be assigned -* @param alpha The value to be assigned -* @param up_to_internal_size Specifies whether alpha should also be written to padded memory (mostly used for clearing the whole buffer). -*/ -template<typename NumericT> -void vector_assign(vector_base<NumericT> & vec1, const NumericT & alpha, bool up_to_internal_size = false) -{ - typedef NumericT value_type; - - value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); - - vcl_size_t start1 = viennacl::traits::start(vec1); - vcl_size_t inc1 = viennacl::traits::stride(vec1); - vcl_size_t size1 = viennacl::traits::size(vec1); - vcl_size_t loop_bound = up_to_internal_size ? vec1.internal_size() : size1; //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding. 
- - value_type data_alpha = static_cast<value_type>(alpha); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (loop_bound > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i = 0; i < static_cast<long>(loop_bound); ++i) - data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_alpha; -} - - -/** @brief Swaps the contents of two vectors, data is copied -* -* @param vec1 The first vector (or -range, or -slice) -* @param vec2 The second vector (or -range, or -slice) -*/ -template<typename NumericT> -void vector_swap(vector_base<NumericT> & vec1, vector_base<NumericT> & vec2) -{ - typedef NumericT value_type; - - value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); - value_type * data_vec2 = detail::extract_raw_pointer<value_type>(vec2); - - vcl_size_t start1 = viennacl::traits::start(vec1); - vcl_size_t inc1 = viennacl::traits::stride(vec1); - vcl_size_t size1 = viennacl::traits::size(vec1); - - vcl_size_t start2 = viennacl::traits::start(vec2); - vcl_size_t inc2 = viennacl::traits::stride(vec2); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i = 0; i < static_cast<long>(size1); ++i) - { - value_type temp = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2]; - data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] = data_vec1[static_cast<vcl_size_t>(i)*inc1+start1]; - data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = temp; - } -} - - -///////////////////////// Elementwise operations ///////////// - -/** @brief Implementation of the element-wise operation v1 = v2 .* v3 and v1 = v2 ./ v3 (using MATLAB syntax) -* -* @param vec1 The result vector (or -range, or -slice) -* @param proxy The proxy object holding v2, v3 and the operation -*/ -template<typename NumericT, typename OpT> -void element_op(vector_base<NumericT> & vec1, - vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_binary<OpT> > const & proxy) -{ - typedef NumericT 
value_type; - typedef viennacl::linalg::detail::op_applier<op_element_binary<OpT> > OpFunctor; - - value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); - value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(proxy.lhs()); - value_type const * data_vec3 = detail::extract_raw_pointer<value_type>(proxy.rhs()); - - vcl_size_t start1 = viennacl::traits::start(vec1); - vcl_size_t inc1 = viennacl::traits::stride(vec1); - vcl_size_t size1 = viennacl::traits::size(vec1); - - vcl_size_t start2 = viennacl::traits::start(proxy.lhs()); - vcl_size_t inc2 = viennacl::traits::stride(proxy.lhs()); - - vcl_size_t start3 = viennacl::traits::start(proxy.rhs()); - vcl_size_t inc3 = viennacl::traits::stride(proxy.rhs()); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i = 0; i < static_cast<long>(size1); ++i) - OpFunctor::apply(data_vec1[static_cast<vcl_size_t>(i)*inc1+start1], data_vec2[static_cast<vcl_size_t>(i)*inc2+start2], data_vec3[static_cast<vcl_size_t>(i)*inc3+start3]); -} - -/** @brief Implementation of the element-wise operation v1 = v2 .* v3 and v1 = v2 ./ v3 (using MATLAB syntax) -* -* @param vec1 The result vector (or -range, or -slice) -* @param proxy The proxy object holding v2, v3 and the operation -*/ -template<typename NumericT, typename OpT> -void element_op(vector_base<NumericT> & vec1, - vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<OpT> > const & proxy) -{ - typedef NumericT value_type; - typedef viennacl::linalg::detail::op_applier<op_element_unary<OpT> > OpFunctor; - - value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); - value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(proxy.lhs()); - - vcl_size_t start1 = viennacl::traits::start(vec1); - vcl_size_t inc1 = viennacl::traits::stride(vec1); - vcl_size_t size1 = viennacl::traits::size(vec1); - - vcl_size_t start2 = 
viennacl::traits::start(proxy.lhs()); - vcl_size_t inc2 = viennacl::traits::stride(proxy.lhs()); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i = 0; i < static_cast<long>(size1); ++i) - OpFunctor::apply(data_vec1[static_cast<vcl_size_t>(i)*inc1+start1], data_vec2[static_cast<vcl_size_t>(i)*inc2+start2]); -} - - -///////////////////////// Norms and inner product /////////////////// - - -//implementation of inner product: - -namespace detail -{ - -// the following circumvents problems when trying to use a variable of template parameter type for a reduction. -// Such a behavior is not covered by the OpenMP standard, hence we manually apply some preprocessor magic to resolve the problem. -// See https://github.com/viennacl/viennacl-dev/issues/112 for a detailed explanation and discussion. - -#define VIENNACL_INNER_PROD_IMPL_1(RESULTSCALART, TEMPSCALART) \ - inline RESULTSCALART inner_prod_impl(RESULTSCALART const * data_vec1, vcl_size_t start1, vcl_size_t inc1, vcl_size_t size1, \ - RESULTSCALART const * data_vec2, vcl_size_t start2, vcl_size_t inc2) { \ - TEMPSCALART temp = 0; - -#define VIENNACL_INNER_PROD_IMPL_2(RESULTSCALART) \ - for (long i = 0; i < static_cast<long>(size1); ++i) \ - temp += data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] * data_vec2[static_cast<vcl_size_t>(i)*inc2+start2]; \ - return static_cast<RESULTSCALART>(temp); \ - } - -// char -VIENNACL_INNER_PROD_IMPL_1(char, int) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_INNER_PROD_IMPL_2(char) - -VIENNACL_INNER_PROD_IMPL_1(unsigned char, int) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_INNER_PROD_IMPL_2(unsigned char) - - -// short -VIENNACL_INNER_PROD_IMPL_1(short, int) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: 
temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_INNER_PROD_IMPL_2(short) - -VIENNACL_INNER_PROD_IMPL_1(unsigned short, int) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_INNER_PROD_IMPL_2(unsigned short) - - -// int -VIENNACL_INNER_PROD_IMPL_1(int, int) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_INNER_PROD_IMPL_2(int) - -VIENNACL_INNER_PROD_IMPL_1(unsigned int, unsigned int) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_INNER_PROD_IMPL_2(unsigned int) - - -// long -VIENNACL_INNER_PROD_IMPL_1(long, long) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_INNER_PROD_IMPL_2(long) - -VIENNACL_INNER_PROD_IMPL_1(unsigned long, unsigned long) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_INNER_PROD_IMPL_2(unsigned long) - - -// float -VIENNACL_INNER_PROD_IMPL_1(float, float) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_INNER_PROD_IMPL_2(float) - -// double -VIENNACL_INNER_PROD_IMPL_1(double, double) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_INNER_PROD_IMPL_2(double) - -#undef VIENNACL_INNER_PROD_IMPL_1 -#undef VIENNACL_INNER_PROD_IMPL_2 -} - -/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2). 
-* -* @param vec1 The first vector -* @param vec2 The second vector -* @param result The result scalar (on the gpu) -*/ -template<typename NumericT, typename ScalarT> -void inner_prod_impl(vector_base<NumericT> const & vec1, - vector_base<NumericT> const & vec2, - ScalarT & result) -{ - typedef NumericT value_type; - - value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); - value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2); - - vcl_size_t start1 = viennacl::traits::start(vec1); - vcl_size_t inc1 = viennacl::traits::stride(vec1); - vcl_size_t size1 = viennacl::traits::size(vec1); - - vcl_size_t start2 = viennacl::traits::start(vec2); - vcl_size_t inc2 = viennacl::traits::stride(vec2); - - result = detail::inner_prod_impl(data_vec1, start1, inc1, size1, - data_vec2, start2, inc2); //Note: Assignment to result might be expensive, thus a temporary is introduced here -} - -template<typename NumericT> -void inner_prod_impl(vector_base<NumericT> const & x, - vector_tuple<NumericT> const & vec_tuple, - vector_base<NumericT> & result) -{ - typedef NumericT value_type; - - value_type const * data_x = detail::extract_raw_pointer<value_type>(x); - - vcl_size_t start_x = viennacl::traits::start(x); - vcl_size_t inc_x = viennacl::traits::stride(x); - vcl_size_t size_x = viennacl::traits::size(x); - - std::vector<value_type> temp(vec_tuple.const_size()); - std::vector<value_type const *> data_y(vec_tuple.const_size()); - std::vector<vcl_size_t> start_y(vec_tuple.const_size()); - std::vector<vcl_size_t> stride_y(vec_tuple.const_size()); - - for (vcl_size_t j=0; j<vec_tuple.const_size(); ++j) - { - data_y[j] = detail::extract_raw_pointer<value_type>(vec_tuple.const_at(j)); - start_y[j] = viennacl::traits::start(vec_tuple.const_at(j)); - stride_y[j] = viennacl::traits::stride(vec_tuple.const_at(j)); - } - - // Note: No OpenMP here because it cannot perform a reduction on temp-array. 
Savings in memory bandwidth are expected to still justify this approach... - for (vcl_size_t i = 0; i < size_x; ++i) - { - value_type entry_x = data_x[i*inc_x+start_x]; - for (vcl_size_t j=0; j < vec_tuple.const_size(); ++j) - temp[j] += entry_x * data_y[j][i*stride_y[j]+start_y[j]]; - } - - for (vcl_size_t j=0; j < vec_tuple.const_size(); ++j) - result[j] = temp[j]; //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation -} - - -namespace detail -{ - -#define VIENNACL_NORM_1_IMPL_1(RESULTSCALART, TEMPSCALART) \ - inline RESULTSCALART norm_1_impl(RESULTSCALART const * data_vec1, vcl_size_t start1, vcl_size_t inc1, vcl_size_t size1) { \ - TEMPSCALART temp = 0; - -#define VIENNACL_NORM_1_IMPL_2(RESULTSCALART, TEMPSCALART) \ - for (long i = 0; i < static_cast<long>(size1); ++i) \ - temp += static_cast<TEMPSCALART>(std::fabs(static_cast<double>(data_vec1[static_cast<vcl_size_t>(i)*inc1+start1]))); \ - return static_cast<RESULTSCALART>(temp); \ - } - -// char -VIENNACL_NORM_1_IMPL_1(char, int) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_NORM_1_IMPL_2(char, int) - -VIENNACL_NORM_1_IMPL_1(unsigned char, int) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_NORM_1_IMPL_2(unsigned char, int) - -// short -VIENNACL_NORM_1_IMPL_1(short, int) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_NORM_1_IMPL_2(short, int) - -VIENNACL_NORM_1_IMPL_1(unsigned short, int) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_NORM_1_IMPL_2(unsigned short, int) - - -// int -VIENNACL_NORM_1_IMPL_1(int, int) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > 
VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_NORM_1_IMPL_2(int, int) - -VIENNACL_NORM_1_IMPL_1(unsigned int, unsigned int) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_NORM_1_IMPL_2(unsigned int, unsigned int) - - -// long -VIENNACL_NORM_1_IMPL_1(long, long) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_NORM_1_IMPL_2(long, long) - -VIENNACL_NORM_1_IMPL_1(unsigned long, unsigned long) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_NORM_1_IMPL_2(unsigned long, unsigned long) - - -// float -VIENNACL_NORM_1_IMPL_1(float, float) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_NORM_1_IMPL_2(float, float) - -// double -VIENNACL_NORM_1_IMPL_1(double, double) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_NORM_1_IMPL_2(double, double) - -#undef VIENNACL_NORM_1_IMPL_1 -#undef VIENNACL_NORM_1_IMPL_2 - -} - -/** @brief Computes the l^1-norm of a vector -* -* @param vec1 The vector -* @param result The result scalar -*/ -template<typename NumericT, typename ScalarT> -void norm_1_impl(vector_base<NumericT> const & vec1, - ScalarT & result) -{ - typedef NumericT value_type; - - value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); - - vcl_size_t start1 = viennacl::traits::start(vec1); - vcl_size_t inc1 = viennacl::traits::stride(vec1); - vcl_size_t size1 = viennacl::traits::size(vec1); - - result = detail::norm_1_impl(data_vec1, start1, inc1, size1); //Note: Assignment to result might be expensive, thus using a temporary for accumulation -} - - - -namespace detail -{ - -#define 
VIENNACL_NORM_2_IMPL_1(RESULTSCALART, TEMPSCALART) \ - inline RESULTSCALART norm_2_impl(RESULTSCALART const * data_vec1, vcl_size_t start1, vcl_size_t inc1, vcl_size_t size1) { \ - TEMPSCALART temp = 0; - -#define VIENNACL_NORM_2_IMPL_2(RESULTSCALART, TEMPSCALART) \ - for (long i = 0; i < static_cast<long>(size1); ++i) { \ - RESULTSCALART data = data_vec1[static_cast<vcl_size_t>(i)*inc1+start1]; \ - temp += static_cast<TEMPSCALART>(data * data); \ - } \ - return static_cast<RESULTSCALART>(temp); \ - } - -// char -VIENNACL_NORM_2_IMPL_1(char, int) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_NORM_2_IMPL_2(char, int) - -VIENNACL_NORM_2_IMPL_1(unsigned char, int) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_NORM_2_IMPL_2(unsigned char, int) - - -// short -VIENNACL_NORM_2_IMPL_1(short, int) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_NORM_2_IMPL_2(short, int) - -VIENNACL_NORM_2_IMPL_1(unsigned short, int) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_NORM_2_IMPL_2(unsigned short, int) - - -// int -VIENNACL_NORM_2_IMPL_1(int, int) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_NORM_2_IMPL_2(int, int) - -VIENNACL_NORM_2_IMPL_1(unsigned int, unsigned int) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_NORM_2_IMPL_2(unsigned int, unsigned int) - - -// long -VIENNACL_NORM_2_IMPL_1(long, long) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif 
-VIENNACL_NORM_2_IMPL_2(long, long) - -VIENNACL_NORM_2_IMPL_1(unsigned long, unsigned long) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_NORM_2_IMPL_2(unsigned long, unsigned long) - - -// float -VIENNACL_NORM_2_IMPL_1(float, float) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_NORM_2_IMPL_2(float, float) - -// double -VIENNACL_NORM_2_IMPL_1(double, double) -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif -VIENNACL_NORM_2_IMPL_2(double, double) - -#undef VIENNACL_NORM_2_IMPL_1 -#undef VIENNACL_NORM_2_IMPL_2 - -} - - -/** @brief Computes the l^2-norm of a vector - implementation -* -* @param vec1 The vector -* @param result The result scalar -*/ -template<typename NumericT, typename ScalarT> -void norm_2_impl(vector_base<NumericT> const & vec1, - ScalarT & result) -{ - typedef NumericT value_type; - - value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); - - vcl_size_t start1 = viennacl::traits::start(vec1); - vcl_size_t inc1 = viennacl::traits::stride(vec1); - vcl_size_t size1 = viennacl::traits::size(vec1); - - result = std::sqrt(detail::norm_2_impl(data_vec1, start1, inc1, size1)); //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation -} - -/** @brief Computes the supremum-norm of a vector -* -* @param vec1 The vector -* @param result The result scalar -*/ -template<typename NumericT, typename ScalarT> -void norm_inf_impl(vector_base<NumericT> const & vec1, - ScalarT & result) -{ - typedef NumericT value_type; - - value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); - - vcl_size_t start1 = viennacl::traits::start(vec1); - vcl_size_t inc1 = viennacl::traits::stride(vec1); - vcl_size_t size1 = viennacl::traits::size(vec1); - - 
vcl_size_t thread_count=1; - - #ifdef VIENNACL_WITH_OPENMP - if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) - thread_count = omp_get_max_threads(); - #endif - - std::vector<value_type> temp(thread_count); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - { - vcl_size_t id = 0; -#ifdef VIENNACL_WITH_OPENMP - id = omp_get_thread_num(); -#endif - - vcl_size_t begin = (size1 * id) / thread_count; - vcl_size_t end = (size1 * (id + 1)) / thread_count; - temp[id] = 0; - - for (vcl_size_t i = begin; i < end; ++i) - temp[id] = std::max<value_type>(temp[id], static_cast<value_type>(std::fabs(static_cast<double>(data_vec1[i*inc1+start1])))); //casting to double in order to avoid problems if T is an integer type - } - for (vcl_size_t i = 1; i < thread_count; ++i) - temp[0] = std::max<value_type>( temp[0], temp[i]); - result = temp[0]; -} - -//This function should return a CPU scalar, otherwise statements like -// vcl_rhs[index_norm_inf(vcl_rhs)] -// are ambiguous -/** @brief Computes the index of the first entry that is equal to the supremum-norm in modulus. -* -* @param vec1 The vector -* @return The result. Note that the result must be a CPU scalar (unsigned int), since gpu scalars are floating point types. 
-*/ -template<typename NumericT> -vcl_size_t index_norm_inf(vector_base<NumericT> const & vec1) -{ - typedef NumericT value_type; - - value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); - - vcl_size_t start1 = viennacl::traits::start(vec1); - vcl_size_t inc1 = viennacl::traits::stride(vec1); - vcl_size_t size1 = viennacl::traits::size(vec1); - vcl_size_t thread_count=1; - -#ifdef VIENNACL_WITH_OPENMP - if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) - thread_count = omp_get_max_threads(); -#endif - - std::vector<value_type> temp(thread_count); - std::vector<vcl_size_t> index(thread_count); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - { - vcl_size_t id = 0; -#ifdef VIENNACL_WITH_OPENMP - id = omp_get_thread_num(); -#endif - vcl_size_t begin = (size1 * id) / thread_count; - vcl_size_t end = (size1 * (id + 1)) / thread_count; - index[id] = start1; - temp[id] = 0; - value_type data; - - for (vcl_size_t i = begin; i < end; ++i) - { - data = static_cast<value_type>(std::fabs(static_cast<double>(data_vec1[i*inc1+start1]))); //casting to double in order to avoid problems if T is an integer type - if (data > temp[id]) - { - index[id] = i; - temp[id] = data; - } - } - } - for (vcl_size_t i = 1; i < thread_count; ++i) - { - if (temp[i] > temp[0]) - { - index[0] = index[i]; - temp[0] = temp[i]; - } - } - return index[0]; -} - -/** @brief Computes the maximum of a vector -* -* @param vec1 The vector -* @param result The result scalar -*/ -template<typename NumericT, typename ScalarT> -void max_impl(vector_base<NumericT> const & vec1, - ScalarT & result) -{ - typedef NumericT value_type; - - value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); - - vcl_size_t start1 = viennacl::traits::start(vec1); - vcl_size_t inc1 = viennacl::traits::stride(vec1); - vcl_size_t size1 = viennacl::traits::size(vec1); - - vcl_size_t thread_count=1; - -#ifdef VIENNACL_WITH_OPENMP - if(size1 
> VIENNACL_OPENMP_VECTOR_MIN_SIZE) - thread_count = omp_get_max_threads(); -#endif - - std::vector<value_type> temp(thread_count); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - { - vcl_size_t id = 0; -#ifdef VIENNACL_WITH_OPENMP - id = omp_get_thread_num(); -#endif - vcl_size_t begin = (size1 * id) / thread_count; - vcl_size_t end = (size1 * (id + 1)) / thread_count; - temp[id] = data_vec1[start1]; - - for (vcl_size_t i = begin; i < end; ++i) - { - value_type v = data_vec1[i*inc1+start1];//Note: Assignment to 'vec1' in std::min might be expensive, thus 'v' is used for the function - temp[id] = std::max<value_type>(temp[id],v); - } - } - for (vcl_size_t i = 1; i < thread_count; ++i) - temp[0] = std::max<value_type>( temp[0], temp[i]); - result = temp[0];//Note: Assignment to result might be expensive, thus 'temp' is used for accumulation -} - -/** @brief Computes the minimum of a vector -* -* @param vec1 The vector -* @param result The result scalar -*/ -template<typename NumericT, typename ScalarT> -void min_impl(vector_base<NumericT> const & vec1, - ScalarT & result) -{ - typedef NumericT value_type; - - value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); - - vcl_size_t start1 = viennacl::traits::start(vec1); - vcl_size_t inc1 = viennacl::traits::stride(vec1); - vcl_size_t size1 = viennacl::traits::size(vec1); - - vcl_size_t thread_count=1; - -#ifdef VIENNACL_WITH_OPENMP - if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) - thread_count = omp_get_max_threads(); -#endif - - std::vector<value_type> temp(thread_count); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - { - vcl_size_t id = 0; -#ifdef VIENNACL_WITH_OPENMP - id = omp_get_thread_num(); -#endif - vcl_size_t begin = (size1 * id) / thread_count; - vcl_size_t end = (size1 * (id + 1)) / thread_count; - temp[id] = data_vec1[start1]; - - for (vcl_size_t i = begin; i < end; 
++i) - { - value_type v = data_vec1[i*inc1+start1];//Note: Assignment to 'vec1' in std::min might be expensive, thus 'v' is used for the function - temp[id] = std::min<value_type>(temp[id],v); - } - } - for (vcl_size_t i = 1; i < thread_count; ++i) - temp[0] = std::min<value_type>( temp[0], temp[i]); - result = temp[0];//Note: Assignment to result might be expensive, thus 'temp' is used for accumulation -} - -/** @brief Computes the sum of all elements from the vector -* -* @param vec1 The vector -* @param result The result scalar -*/ -template<typename NumericT, typename ScalarT> -void sum_impl(vector_base<NumericT> const & vec1, - ScalarT & result) -{ - typedef NumericT value_type; - - value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); - - vcl_size_t start1 = viennacl::traits::start(vec1); - vcl_size_t inc1 = viennacl::traits::stride(vec1); - vcl_size_t size1 = viennacl::traits::size(vec1); - - value_type temp = 0; -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for reduction(+:temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i = 0; i < static_cast<long>(size1); ++i) - temp += data_vec1[static_cast<vcl_size_t>(i)*inc1+start1]; - - result = temp; //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation -} - -/** @brief Computes a plane rotation of two vectors. 
-* -* Computes (x,y) <- (alpha * x + beta * y, -beta * x + alpha * y) -* -* @param vec1 The first vector -* @param vec2 The second vector -* @param alpha The first transformation coefficient -* @param beta The second transformation coefficient -*/ -template<typename NumericT> -void plane_rotation(vector_base<NumericT> & vec1, - vector_base<NumericT> & vec2, - NumericT alpha, NumericT beta) -{ - typedef NumericT value_type; - - value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); - value_type * data_vec2 = detail::extract_raw_pointer<value_type>(vec2); - - vcl_size_t start1 = viennacl::traits::start(vec1); - vcl_size_t inc1 = viennacl::traits::stride(vec1); - vcl_size_t size1 = viennacl::traits::size(vec1); - - vcl_size_t start2 = viennacl::traits::start(vec2); - vcl_size_t inc2 = viennacl::traits::stride(vec2); - - value_type data_alpha = alpha; - value_type data_beta = beta; - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i = 0; i < static_cast<long>(size1); ++i) - { - value_type temp1 = data_vec1[static_cast<vcl_size_t>(i)*inc1+start1]; - value_type temp2 = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2]; - - data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_alpha * temp1 + data_beta * temp2; - data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] = data_alpha * temp2 - data_beta * temp1; - } -} - -namespace detail -{ - /** @brief Implementation of inclusive_scan and exclusive_scan for the host (OpenMP) backend. 
*/ - template<typename NumericT> - void vector_scan_impl(vector_base<NumericT> const & vec1, - vector_base<NumericT> & vec2, - bool is_inclusive) - { - NumericT const * data_vec1 = detail::extract_raw_pointer<NumericT>(vec1); - NumericT * data_vec2 = detail::extract_raw_pointer<NumericT>(vec2); - - vcl_size_t start1 = viennacl::traits::start(vec1); - vcl_size_t inc1 = viennacl::traits::stride(vec1); - vcl_size_t size1 = viennacl::traits::size(vec1); - if (size1 < 1) - return; - - vcl_size_t start2 = viennacl::traits::start(vec2); - vcl_size_t inc2 = viennacl::traits::stride(vec2); - -#ifdef VIENNACL_WITH_OPENMP - if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) - { - std::vector<NumericT> thread_results(omp_get_max_threads()); - - // inclusive scan each thread segment: - #pragma omp parallel - { - vcl_size_t work_per_thread = (size1 - 1) / thread_results.size() + 1; - vcl_size_t thread_start = work_per_thread * omp_get_thread_num(); - vcl_size_t thread_stop = std::min<vcl_size_t>(thread_start + work_per_thread, size1); - - NumericT thread_sum = 0; - for(vcl_size_t i = thread_start; i < thread_stop; i++) - thread_sum += data_vec1[i * inc1 + start1]; - - thread_results[omp_get_thread_num()] = thread_sum; - } - - // exclusive-scan of thread results: - NumericT current_offset = 0; - for (vcl_size_t i=0; i<thread_results.size(); ++i) - { - NumericT tmp = thread_results[i]; - thread_results[i] = current_offset; - current_offset += tmp; - } - - // exclusive/inclusive scan of each segment with correct offset: - #pragma omp parallel - { - vcl_size_t work_per_thread = (size1 - 1) / thread_results.size() + 1; - vcl_size_t thread_start = work_per_thread * omp_get_thread_num(); - vcl_size_t thread_stop = std::min<vcl_size_t>(thread_start + work_per_thread, size1); - - NumericT thread_sum = thread_results[omp_get_thread_num()]; - if (is_inclusive) - { - for(vcl_size_t i = thread_start; i < thread_stop; i++) - { - thread_sum += data_vec1[i * inc1 + start1]; - data_vec2[i * inc2 + 
start2] = thread_sum; - } - } - else - { - for(vcl_size_t i = thread_start; i < thread_stop; i++) - { - NumericT tmp = data_vec1[i * inc1 + start1]; - data_vec2[i * inc2 + start2] = thread_sum; - thread_sum += tmp; - } - } - } - } else -#endif - { - NumericT sum = 0; - if (is_inclusive) - { - for(vcl_size_t i = 0; i < size1; i++) - { - sum += data_vec1[i * inc1 + start1]; - data_vec2[i * inc2 + start2] = sum; - } - } - else - { - for(vcl_size_t i = 0; i < size1; i++) - { - NumericT tmp = data_vec1[i * inc1 + start1]; - data_vec2[i * inc2 + start2] = sum; - sum += tmp; - } - } - } - - } -} - -/** @brief This function implements an inclusive scan on the host using OpenMP. -* -* Given an element vector (x_0, x_1, ..., x_{n-1}), -* this routine computes (x_0, x_0 + x_1, ..., x_0 + x_1 + ... + x_{n-1}) -* -* @param vec1 Input vector: Gets overwritten by the routine. -* @param vec2 The output vector. Either idential to vec1 or non-overlapping. -*/ -template<typename NumericT> -void inclusive_scan(vector_base<NumericT> const & vec1, - vector_base<NumericT> & vec2) -{ - detail::vector_scan_impl(vec1, vec2, true); -} - -/** @brief This function implements an exclusive scan on the host using OpenMP. -* -* Given an element vector (x_0, x_1, ..., x_{n-1}), -* this routine computes (0, x_0, x_0 + x_1, ..., x_0 + x_1 + ... + x_{n-2}) -* -* @param vec1 Input vector: Gets overwritten by the routine. -* @param vec2 The output vector. Either idential to vec1 or non-overlapping. 
-*/ -template<typename NumericT> -void exclusive_scan(vector_base<NumericT> const & vec1, - vector_base<NumericT> & vec2) -{ - detail::vector_scan_impl(vec1, vec2, false); -} - - -} //namespace host_based -} //namespace linalg -} //namespace viennacl - - -#endif http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp deleted file mode 100644 index 1038b2b..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp +++ /dev/null @@ -1,228 +0,0 @@ -#ifndef VIENNACL_LINALG_ICHOL_HPP_ -#define VIENNACL_LINALG_ICHOL_HPP_ - -/* ========================================================================= - Copyright (c) 2010-2016, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. - Portions of this software are copyright by UChicago Argonne, LLC. - - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - -/** @file viennacl/linalg/ichol.hpp - @brief Implementations of incomplete Cholesky factorization preconditioners with static nonzero pattern. 
-*/ - -#include <vector> -#include <cmath> -#include <iostream> -#include "viennacl/forwards.h" -#include "viennacl/tools/tools.hpp" -#include "viennacl/compressed_matrix.hpp" - -#include "viennacl/linalg/host_based/common.hpp" - -#include <map> - -namespace viennacl -{ -namespace linalg -{ - -/** @brief A tag for incomplete Cholesky factorization with static pattern (ILU0) -*/ -class ichol0_tag {}; - - -/** @brief Implementation of a ILU-preconditioner with static pattern. Optimized version for CSR matrices. - * - * Refer to Chih-Jen Lin and Jorge J. Moré, Incomplete Cholesky Factorizations with Limited Memory, SIAM J. Sci. Comput., 21(1), 24â45 - * for one of many descriptions of incomplete Cholesky Factorizations - * - * @param A The input matrix in CSR format - * // param tag An ichol0_tag in order to dispatch among several other preconditioners. - */ -template<typename NumericT> -void precondition(viennacl::compressed_matrix<NumericT> & A, ichol0_tag const & /* tag */) -{ - assert( (viennacl::traits::context(A).memory_type() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ICHOL0") ); - - NumericT * elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle()); - unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1()); - unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2()); - - //std::cout << A.size1() << std::endl; - for (vcl_size_t i=0; i<A.size1(); ++i) - { - unsigned int row_i_begin = row_buffer[i]; - unsigned int row_i_end = row_buffer[i+1]; - - // get a_ii: - NumericT a_ii = 0; - for (unsigned int buf_index_aii = row_i_begin; buf_index_aii < row_i_end; ++buf_index_aii) - { - if (col_buffer[buf_index_aii] == i) - { - a_ii = std::sqrt(elements[buf_index_aii]); - elements[buf_index_aii] = a_ii; - break; - } - } - - // Now scale column/row i, i.e. 
A(k, i) /= A(i, i) - for (unsigned int buf_index_aii = row_i_begin; buf_index_aii < row_i_end; ++buf_index_aii) - { - if (col_buffer[buf_index_aii] > i) - elements[buf_index_aii] /= a_ii; - } - - // Now compute A(k, j) -= A(k, i) * A(j, i) for all nonzero k, j in column i: - for (unsigned int buf_index_j = row_i_begin; buf_index_j < row_i_end; ++buf_index_j) - { - unsigned int j = col_buffer[buf_index_j]; - if (j <= i) - continue; - - NumericT a_ji = elements[buf_index_j]; - - for (unsigned int buf_index_k = row_i_begin; buf_index_k < row_i_end; ++buf_index_k) - { - unsigned int k = col_buffer[buf_index_k]; - if (k < j) - continue; - - NumericT a_ki = elements[buf_index_k]; - - //Now check whether A(k, j) is in nonzero pattern: - unsigned int row_j_begin = row_buffer[j]; - unsigned int row_j_end = row_buffer[j+1]; - for (unsigned int buf_index_kj = row_j_begin; buf_index_kj < row_j_end; ++buf_index_kj) - { - if (col_buffer[buf_index_kj] == k) - { - elements[buf_index_kj] -= a_ki * a_ji; - break; - } - } - } - } - - } - -} - - -/** @brief Incomplete Cholesky preconditioner class with static pattern (ICHOL0), can be supplied to solve()-routines -*/ -template<typename MatrixT> -class ichol0_precond -{ - typedef typename MatrixT::value_type NumericType; - -public: - ichol0_precond(MatrixT const & mat, ichol0_tag const & tag) : tag_(tag), LLT(mat.size1(), mat.size2(), viennacl::context(viennacl::MAIN_MEMORY)) - { - //initialize preconditioner: - //std::cout << "Start CPU precond" << std::endl; - init(mat); - //std::cout << "End CPU precond" << std::endl; - } - - template<typename VectorT> - void apply(VectorT & vec) const - { - unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LLT.handle1()); - unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LLT.handle2()); - NumericType const * elements = 
viennacl::linalg::host_based::detail::extract_raw_pointer<NumericType>(LLT.handle()); - - // Note: L is stored in a column-oriented fashion, i.e. transposed w.r.t. the row-oriented layout. Thus, the factorization A = L L^T holds L in the upper triangular part of A. - viennacl::linalg::host_based::detail::csr_trans_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, LLT.size2(), lower_tag()); - viennacl::linalg::host_based::detail::csr_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, LLT.size2(), upper_tag()); - } - -private: - void init(MatrixT const & mat) - { - viennacl::context host_ctx(viennacl::MAIN_MEMORY); - viennacl::switch_memory_context(LLT, host_ctx); - - viennacl::copy(mat, LLT); - viennacl::linalg::precondition(LLT, tag_); - } - - ichol0_tag const & tag_; - viennacl::compressed_matrix<NumericType> LLT; -}; - - -/** @brief ILU0 preconditioner class, can be supplied to solve()-routines. -* -* Specialization for compressed_matrix -*/ -template<typename NumericT, unsigned int AlignmentV> -class ichol0_precond< compressed_matrix<NumericT, AlignmentV> > -{ - typedef compressed_matrix<NumericT, AlignmentV> MatrixType; - -public: - ichol0_precond(MatrixType const & mat, ichol0_tag const & tag) : tag_(tag), LLT(mat.size1(), mat.size2(), viennacl::traits::context(mat)) - { - //initialize preconditioner: - //std::cout << "Start GPU precond" << std::endl; - init(mat); - //std::cout << "End GPU precond" << std::endl; - } - - void apply(vector<NumericT> & vec) const - { - if (viennacl::traits::context(vec).memory_type() != viennacl::MAIN_MEMORY) - { - viennacl::context host_ctx(viennacl::MAIN_MEMORY); - viennacl::context old_ctx = viennacl::traits::context(vec); - - viennacl::switch_memory_context(vec, host_ctx); - viennacl::linalg::inplace_solve(trans(LLT), vec, lower_tag()); - viennacl::linalg::inplace_solve( LLT , vec, upper_tag()); - viennacl::switch_memory_context(vec, old_ctx); - } - else //apply ILU0 directly: - { - // Note: L 
is stored in a column-oriented fashion, i.e. transposed w.r.t. the row-oriented layout. Thus, the factorization A = L L^T holds L in the upper triangular part of A. - viennacl::linalg::inplace_solve(trans(LLT), vec, lower_tag()); - viennacl::linalg::inplace_solve( LLT , vec, upper_tag()); - } - } - -private: - void init(MatrixType const & mat) - { - viennacl::context host_ctx(viennacl::MAIN_MEMORY); - viennacl::switch_memory_context(LLT, host_ctx); - LLT = mat; - - viennacl::linalg::precondition(LLT, tag_); - } - - ichol0_tag const & tag_; - viennacl::compressed_matrix<NumericT> LLT; -}; - -} -} - - - - -#endif - - -
