http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp new file mode 100644 index 0000000..56e3c14 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp @@ -0,0 +1,705 @@ +#ifndef VIENNACL_LINALG_HOST_BASED_SPGEMM_VECTOR_HPP_ +#define VIENNACL_LINALG_HOST_BASED_SPGEMM_VECTOR_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/host_based/sparse_matrix_operations.hpp + @brief Implementations of operations using sparse matrices on the CPU using a single thread or OpenMP. 
*/

#include "viennacl/forwards.h"
#include "viennacl/linalg/host_based/common.hpp"


#ifdef VIENNACL_WITH_AVX2
#include "immintrin.h"
#endif


namespace viennacl
{
namespace linalg
{
namespace host_based
{



#ifdef VIENNACL_WITH_AVX2
/** @brief AVX2 8-way merge of the column indices of up to eight rows of B into one sorted, duplicate-free index list.
*
* Merges the CSR column-index lists of the rows of B selected by [row_indices_B_begin, row_indices_B_end)
* (at most eight rows; fewer rows are handled via gather masks) and writes the distinct column indices in
* ascending order to row_C_vector_output. Used by the symbolic (pattern-only) phase of sparse matrix-matrix
* multiplication C = A * B.
*
* @param row_indices_B_begin  Start of the row indices of B to merge (taken from a row of A).
* @param row_indices_B_end    End of that index range; at most the first eight entries are consumed.
* @param B_row_buffer         CSR row pointer array of B.
* @param B_col_buffer         CSR column index array of B.
* @param B_size2              Number of columns of B; used as the sentinel "lane exhausted" index.
* @param row_C_vector_output  Output buffer receiving the merged, sorted, unique column indices.
* @return Number of indices written to row_C_vector_output.
*/
inline
unsigned int row_C_scan_symbolic_vector_AVX2(int const *row_indices_B_begin, int const *row_indices_B_end,
                                             int const *B_row_buffer, int const *B_col_buffer, int B_size2,
                                             int *row_C_vector_output)
{
  __m256i avx_all_ones    = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
  __m256i avx_all_bsize2  = _mm256_set_epi32(B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2);

  // Lane i handles the i-th row index. A gather mask lane is active when its sign bit is set:
  // offset - num_rows is negative exactly for the lanes that have a valid row to load.
  __m256i avx_row_indices_offsets = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  __m256i avx_load_mask           = _mm256_sub_epi32(avx_row_indices_offsets, _mm256_set1_epi32(row_indices_B_end - row_indices_B_begin));
  __m256i avx_load_mask2          = avx_load_mask;

  // Gather the row indices of B, then each row's [start, end) range in the CSR column buffer.
  // Note: _mm256_mask_i32gather_epi32 zeroes the mask register, hence the reloads below.
  __m256i avx_row_indices = _mm256_set1_epi32(0);
  avx_row_indices = _mm256_mask_i32gather_epi32(avx_row_indices, row_indices_B_begin, avx_row_indices_offsets, avx_load_mask, 4);
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256i avx_row_start = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer, avx_row_indices, avx_load_mask, 4);
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256i avx_row_end = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer+1, avx_row_indices, avx_load_mask, 4);

  // Prime the "index front": the next (smallest unconsumed) column index of each lane's row,
  // or the sentinel B_size2 once a lane is exhausted (row_start has reached row_end).
  avx_load_mask = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
  __m256i avx_index_front = avx_all_bsize2;
  avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);

  int *output_ptr = row_C_vector_output;

  while (1)
  {
    // get minimum index in current front (log2(8) = 3 min/permute steps broadcast the minimum to all lanes):
    __m256i avx_index_min1 = avx_index_front;
    __m256i avx_temp = _mm256_permutevar8x32_epi32(avx_index_min1, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4));
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first four elements compared against last four elements

    avx_temp = _mm256_shuffle_epi32(avx_index_min1, int(78)); // 0b01001110 = 78, using shuffle instead of permutevar here because of lower latency
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first two elements compared against elements three and four (same for upper half of register)

    avx_temp = _mm256_shuffle_epi32(avx_index_min1, int(177)); // 0b10110001 = 177, using shuffle instead of permutevar here because of lower latency
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // now all entries of avx_index_min1 hold the minimum

    int min_index_in_front = ((int*)&avx_index_min1)[0];
    // check for end of merge operation: all lanes at the sentinel means every row is consumed.
    if (min_index_in_front == B_size2)
      break;

    // write current entry (each distinct index is emitted exactly once):
    *output_ptr = min_index_in_front;
    ++output_ptr;

    // advance index front where equal to minimum index:
    avx_load_mask = _mm256_cmpeq_epi32(avx_index_front, avx_index_min1);
    // first part: set index to B_size2 if equal to minimum index:
    avx_temp = _mm256_and_si256(avx_all_bsize2, avx_load_mask);
    avx_index_front = _mm256_max_epi32(avx_index_front, avx_temp);
    // second part: increment row_start registers where minimum found:
    avx_temp = _mm256_and_si256(avx_all_ones, avx_load_mask); //ones only where the minimum was found
    avx_row_start = _mm256_add_epi32(avx_row_start, avx_temp);
    // third part part: load new data where more entries available:
    avx_load_mask = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
    avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);
  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}
#endif

/** @brief Merges up to IndexNum rows from B into the result buffer.
*
* Because the input buffer also needs to be considered, this routine actually works on an index front of length (IndexNum+1).
* Scalar analogue of the AVX2 merge: the front holds the next unconsumed column index of each of the IndexNum
* selected rows of B plus the previously accumulated result buffer; B_size2 acts as the "exhausted" sentinel.
* Returns the number of distinct indices written to row_C_vector_output.
**/
template<unsigned int IndexNum>
unsigned int row_C_scan_symbolic_vector_N(unsigned int const *row_indices_B,
                                          unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, unsigned int B_size2,
                                          unsigned int const *row_C_vector_input, unsigned int const *row_C_vector_input_end,
                                          unsigned int *row_C_vector_output)
{
  unsigned int index_front[IndexNum+1];
  unsigned int const *index_front_start[IndexNum+1];
  unsigned int const *index_front_end[IndexNum+1];

  // Set up pointers for loading the indices:
  for (unsigned int i=0; i<IndexNum; ++i, ++row_indices_B)
  {
    index_front_start[i] = B_col_buffer + B_row_buffer[*row_indices_B];
    index_front_end[i]   = B_col_buffer + B_row_buffer[*row_indices_B + 1];
  }
  // Slot IndexNum carries the already-merged input buffer along:
  index_front_start[IndexNum] = row_C_vector_input;
  index_front_end[IndexNum]   = row_C_vector_input_end;

  // load indices (sentinel B_size2 for empty slots):
  for (unsigned int i=0; i<=IndexNum; ++i)
    index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;

  unsigned int *output_ptr = row_C_vector_output;

  while (1)
  {
    // get minimum index in current front:
    unsigned int min_index_in_front = B_size2;
    for (unsigned int i=0; i<=IndexNum; ++i)
      min_index_in_front = std::min(min_index_in_front, index_front[i]);

    if (min_index_in_front == B_size2) // we're done
      break;

    // advance index front where equal to minimum index (deduplicates across slots):
    for (unsigned int i=0; i<=IndexNum; ++i)
    {
      if (index_front[i] == min_index_in_front)
      {
        index_front_start[i] += 1;
        index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;
      }
    }

    // write current entry:
    *output_ptr = min_index_in_front;
    ++output_ptr;
  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}

// Policy types for row_C_scan_symbolic_vector_1: allow the same merge code to either store
// the merged indices or merely count them (the final pass of the symbolic phase only needs the count).
struct spgemm_output_write_enabled  { static void apply(unsigned int *ptr, unsigned int value) { *ptr = value; } };
struct spgemm_output_write_disabled { static void apply(unsigned int *   , unsigned int       ) {               } };

/** @brief Two-way merge of two sorted index ranges, dropping duplicates.
*
* @tparam OutputWriterT  spgemm_output_write_enabled to store each merged index, or
*                        spgemm_output_write_disabled to only count (no stores).
* @param termination_index  Sentinel larger than any valid index (B_size2); both ranges must only contain smaller values.
* @return Number of distinct indices in the union of the two ranges.
*/
template<typename OutputWriterT>
unsigned int row_C_scan_symbolic_vector_1(unsigned int const *input1_begin, unsigned int const *input1_end,
                                          unsigned int const *input2_begin, unsigned int const *input2_end,
                                          unsigned int termination_index,
                                          unsigned int *output_begin)
{
  unsigned int *output_ptr = output_begin;

  unsigned int val_1 = (input1_begin < input1_end) ? *input1_begin : termination_index;
  unsigned int val_2 = (input2_begin < input2_end) ? *input2_begin : termination_index;
  while (1)
  {
    unsigned int min_index = std::min(val_1, val_2);

    if (min_index == termination_index)
      break;

    // Advance BOTH inputs when they tie, so duplicates are emitted once:
    if (min_index == val_1)
    {
      ++input1_begin;
      val_1 = (input1_begin < input1_end) ? *input1_begin : termination_index;
    }

    if (min_index == val_2)
    {
      ++input2_begin;
      val_2 = (input2_begin < input2_end) ? *input2_begin : termination_index;
    }

    // write current entry:
    OutputWriterT::apply(output_ptr, min_index); // *output_ptr = min_index; if necessary
    ++output_ptr;
  }

  return static_cast<unsigned int>(output_ptr - output_begin);
}

/** @brief Computes the number of nonzeros in one row of C = A * B (symbolic phase).
*
* Merges the column patterns of all rows of B selected by row [row_start_A, row_end_A) of A.
* Rows are folded in batches (8 with AVX2, otherwise 2) into row_C_vector_1/2, which are
* double-buffered via std::swap; row_C_vector_3 holds intermediate batch merges.
* The final merge uses the write-disabled policy since only the count is returned.
*
* @return Length (number of nonzeros) of the row of C.
*/
inline
unsigned int row_C_scan_symbolic_vector(unsigned int row_start_A, unsigned int row_end_A, unsigned int const *A_col_buffer,
                                        unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, unsigned int B_size2,
                                        unsigned int *row_C_vector_1, unsigned int *row_C_vector_2, unsigned int *row_C_vector_3)
{
  // Trivial case: row length 0:
  if (row_start_A == row_end_A)
    return 0;

  // Trivial case: row length 1: the row of C has exactly the pattern of that single row of B.
  if (row_end_A - row_start_A == 1)
  {
    unsigned int A_col = A_col_buffer[row_start_A];
    return B_row_buffer[A_col + 1] - B_row_buffer[A_col];
  }

  // Optimizations for row length 2: count-only merge, no intermediate storage needed.
  unsigned int row_C_len = 0;
  if (row_end_A - row_start_A == 2)
  {
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
    return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
                                                                      B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
                                                                      B_size2,
                                                                      row_C_vector_1);
  }
  else // for more than two rows we can safely merge the first two:
  {
#ifdef VIENNACL_WITH_AVX2
    // AVX2 merges up to eight rows at once (masked internally if fewer remain):
    row_C_len = row_C_scan_symbolic_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A),
                                                (const int*)B_row_buffer, (const int*)B_col_buffer, int(B_size2),
                                                (int*)row_C_vector_1);
    row_start_A += 8;
#else
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
    row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
                                                                         B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
                                                                         B_size2,
                                                                         row_C_vector_1);
    row_start_A += 2;
#endif
  }

  // all other row lengths: fold remaining rows of B into the accumulated buffer.
  while (row_end_A > row_start_A)
  {
#ifdef VIENNACL_WITH_AVX2
    if (row_end_A - row_start_A > 2) // we deal with one or two remaining rows more efficiently below:
    {
      unsigned int merged_len = row_C_scan_symbolic_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A),
                                                                (const int*)B_row_buffer, (const int*)B_col_buffer, int(B_size2),
                                                                (int*)row_C_vector_3);
      if (row_start_A + 8 >= row_end_A) // final batch: only the count is needed, skip the stores
        row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                               row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                               B_size2,
                                                                               row_C_vector_2);
      else
        row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                              row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                              B_size2,
                                                                              row_C_vector_2);
      row_start_A += 8;
    }
    else
#endif
    if (row_start_A == row_end_A - 1) // last merge operation. No need to write output
    {
      // process last row
      unsigned int row_index_B = A_col_buffer[row_start_A];
      return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(B_col_buffer + B_row_buffer[row_index_B], B_col_buffer + B_row_buffer[row_index_B + 1],
                                                                        row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                        B_size2,
                                                                        row_C_vector_2);
    }
    else if (row_start_A + 1 < row_end_A) // at least two more rows left, so merge them
    {
      // merge two rows of B into row_C_vector_3, then fold into the accumulated buffer:
      unsigned int A_col_1 = A_col_buffer[row_start_A];
      unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
      unsigned int merged_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
                                                                                          B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
                                                                                          B_size2,
                                                                                          row_C_vector_3);
      if (row_start_A + 2 == row_end_A) // last merge does not need a write:
        return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                          row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                          B_size2,
                                                                          row_C_vector_2);
      else
        row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                              row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                              B_size2,
                                                                              row_C_vector_2);
      row_start_A += 2;
    }
    else
    {
      // NOTE(review): with the loop condition row_start_A < row_end_A, one of the two branches above
      // always matches (row_start_A + 1 is either == or < row_end_A), so this branch looks unreachable;
      // kept for safety / symmetry with the numeric variant.
      // process single row:
      unsigned int row_index_B = A_col_buffer[row_start_A];
      row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[row_index_B], B_col_buffer + B_row_buffer[row_index_B + 1],
                                                                            row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                            B_size2,
                                                                            row_C_vector_2);
      ++row_start_A;
    }

    // double-buffering: the freshly merged buffer becomes the accumulated input of the next iteration.
    std::swap(row_C_vector_1, row_C_vector_2);
  }

  return row_C_len;
}

//////////////////////////////

/** @brief Merges up to IndexNum rows from B into the result buffer.
*
* Because the input buffer also needs to be considered, this routine actually works on an index front of length (IndexNum+1).
* Numeric counterpart of row_C_scan_symbolic_vector_N: in addition to merging the sorted column indices,
* it accumulates values_A[i] * B(row_i, col) for every slot contributing a given column index.
* The accumulated input buffer participates with coefficient NumericT(1).
* Returns the number of (index, value) pairs written.
**/
template<unsigned int IndexNum, typename NumericT>
unsigned int row_C_scan_numeric_vector_N(unsigned int const *row_indices_B, NumericT const *val_A,
                                         unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, NumericT const *B_elements, unsigned int B_size2,
                                         unsigned int const *row_C_vector_input, unsigned int const *row_C_vector_input_end, NumericT *row_C_vector_input_values,
                                         unsigned int *row_C_vector_output, NumericT *row_C_vector_output_values)
{
  unsigned int index_front[IndexNum+1];
  unsigned int const *index_front_start[IndexNum+1];
  unsigned int const *index_front_end[IndexNum+1];
  NumericT const * value_front_start[IndexNum+1];
  NumericT values_A[IndexNum+1];

  // Set up pointers for loading the indices:
  for (unsigned int i=0; i<IndexNum; ++i, ++row_indices_B)
  {
    unsigned int row_B = *row_indices_B;

    index_front_start[i] = B_col_buffer + B_row_buffer[row_B];
    index_front_end[i]   = B_col_buffer + B_row_buffer[row_B + 1];
    value_front_start[i] = B_elements + B_row_buffer[row_B];
    values_A[i] = val_A[i];
  }
  // Slot IndexNum carries the accumulated input buffer along, scaled by 1:
  index_front_start[IndexNum] = row_C_vector_input;
  index_front_end[IndexNum]   = row_C_vector_input_end;
  value_front_start[IndexNum] = row_C_vector_input_values;
  values_A[IndexNum] = NumericT(1);

  // load indices (sentinel B_size2 for empty slots):
  for (unsigned int i=0; i<=IndexNum; ++i)
    index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;

  unsigned int *output_ptr = row_C_vector_output;

  while (1)
  {
    // get minimum index in current front:
    unsigned int min_index_in_front = B_size2;
    for (unsigned int i=0; i<=IndexNum; ++i)
      min_index_in_front = std::min(min_index_in_front, index_front[i]);

    if (min_index_in_front == B_size2) // we're done
      break;

    // advance index front where equal to minimum index, accumulating the products:
    NumericT row_C_value = 0;
    for (unsigned int i=0; i<=IndexNum; ++i)
    {
      if (index_front[i] == min_index_in_front)
      {
        index_front_start[i] += 1;
        index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;

        row_C_value += values_A[i] * *value_front_start[i];
        value_front_start[i] += 1;
      }
    }

    // write current entry:
    *output_ptr = min_index_in_front;
    ++output_ptr;
    *row_C_vector_output_values = row_C_value;
    ++row_C_vector_output_values;
  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}



#ifdef VIENNACL_WITH_AVX2
/** @brief AVX2 8-way numeric merge: like row_C_scan_symbolic_vector_AVX2, but also accumulates values.
*
* Merges up to eight rows of B (weighted by the corresponding entries values_A of a row of A) into a
* sorted list of (column index, value) pairs. double-only: value lanes are split into a low and a high
* __m256d half (four doubles each) matching the eight int32 index lanes.
*
* @return Number of (index, value) pairs written.
*/
inline
unsigned int row_C_scan_numeric_vector_AVX2(int const *row_indices_B_begin, int const *row_indices_B_end, double const *values_A,
                                            int const *B_row_buffer, int const *B_col_buffer, double const *B_elements,
                                            int B_size2,
                                            int *row_C_vector_output, double *row_C_vector_output_values)
{
  __m256i avx_all_ones    = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
  __m256i avx_all_bsize2  = _mm256_set_epi32(B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2);

  // Active-lane mask: sign bit set for lanes with a valid row (offset < number of rows).
  __m256i avx_row_indices_offsets = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  __m256i avx_load_mask           = _mm256_sub_epi32(avx_row_indices_offsets, _mm256_set1_epi32(row_indices_B_end - row_indices_B_begin));
  __m256i avx_load_mask2          = avx_load_mask;

  __m256i avx_row_indices = _mm256_set1_epi32(0);
  avx_row_indices = _mm256_mask_i32gather_epi32(avx_row_indices, row_indices_B_begin, avx_row_indices_offsets, avx_load_mask, 4);

  // load values from A (the permutevar expands the 32-bit lane mask to the 64-bit lanes of the pd gather):
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256d avx_value_A_low = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
                                                     values_A, //base ptr
                                                     _mm256_extractf128_si256(avx_row_indices_offsets, 0), //indices
                                                     _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)), 8); // mask
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256d avx_value_A_high = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
                                                      values_A, //base ptr
                                                      _mm256_extractf128_si256(avx_row_indices_offsets, 1), //indices
                                                      _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)), 8); // mask


  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256i avx_row_start = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer, avx_row_indices, avx_load_mask, 4);
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256i avx_row_end = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer+1, avx_row_indices, avx_load_mask, 4);

  // Prime the index front (sentinel B_size2 where a lane has no entries):
  avx_load_mask = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
  avx_load_mask2 = avx_load_mask;
  __m256i avx_index_front = avx_all_bsize2;
  avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);

  // load front values from B:
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256d avx_value_front_low = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
                                                         B_elements, //base ptr
                                                         _mm256_extractf128_si256(avx_row_start, 0), //indices
                                                         _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)), 8); // mask
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256d avx_value_front_high = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
                                                          B_elements, //base ptr
                                                          _mm256_extractf128_si256(avx_row_start, 1), //indices
                                                          _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)), 8); // mask

  int *output_ptr = row_C_vector_output;

  while (1)
  {
    // get minimum index in current front (three-step min reduction, as in the symbolic variant):
    __m256i avx_index_min1 = avx_index_front;
    __m256i avx_temp = _mm256_permutevar8x32_epi32(avx_index_min1, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4));
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first four elements compared against last four elements

    avx_temp = _mm256_shuffle_epi32(avx_index_min1, int(78)); // 0b01001110 = 78, using shuffle instead of permutevar here because of lower latency
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first two elements compared against elements three and four (same for upper half of register)

    avx_temp = _mm256_shuffle_epi32(avx_index_min1, int(177)); // 0b10110001 = 177, using shuffle instead of permutevar here because of lower latency
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // now all entries of avx_index_min1 hold the minimum

    int min_index_in_front = ((int*)&avx_index_min1)[0];
    // check for end of merge operation:
    if (min_index_in_front == B_size2)
      break;

    // accumulate value (can certainly be done more elegantly...): lanes 0-3 use the low halves, 4-7 the high halves.
    double value = 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[0]) ? ((double*)&avx_value_front_low)[0]  * ((double*)&avx_value_A_low)[0]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[1]) ? ((double*)&avx_value_front_low)[1]  * ((double*)&avx_value_A_low)[1]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[2]) ? ((double*)&avx_value_front_low)[2]  * ((double*)&avx_value_A_low)[2]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[3]) ? ((double*)&avx_value_front_low)[3]  * ((double*)&avx_value_A_low)[3]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[4]) ? ((double*)&avx_value_front_high)[0] * ((double*)&avx_value_A_high)[0] : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[5]) ? ((double*)&avx_value_front_high)[1] * ((double*)&avx_value_A_high)[1] : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[6]) ? ((double*)&avx_value_front_high)[2] * ((double*)&avx_value_A_high)[2] : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[7]) ? ((double*)&avx_value_front_high)[3] * ((double*)&avx_value_A_high)[3] : 0;
    *row_C_vector_output_values = value;
    ++row_C_vector_output_values;

    // write current entry:
    *output_ptr = min_index_in_front;
    ++output_ptr;

    // advance index front where equal to minimum index:
    avx_load_mask = _mm256_cmpeq_epi32(avx_index_front, avx_index_min1);
    // first part: set index to B_size2 if equal to minimum index:
    avx_temp = _mm256_and_si256(avx_all_bsize2, avx_load_mask);
    avx_index_front = _mm256_max_epi32(avx_index_front, avx_temp);
    // second part: increment row_start registers where minimum found:
    avx_temp = _mm256_and_si256(avx_all_ones, avx_load_mask); //ones only where the minimum was found
    avx_row_start = _mm256_add_epi32(avx_row_start, avx_temp);
    // third part part: load new data where more entries available:
    avx_load_mask = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
    avx_load_mask2 = avx_load_mask;
    avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);

    // load new values where necessary:
    avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
    avx_value_front_low = _mm256_mask_i32gather_pd(avx_value_front_low, //src
                                                   B_elements, //base ptr
                                                   _mm256_extractf128_si256(avx_row_start, 0), //indices
                                                   _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)), 8); // mask

    avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
    avx_value_front_high = _mm256_mask_i32gather_pd(avx_value_front_high, //src
                                                    B_elements, //base ptr
                                                    _mm256_extractf128_si256(avx_row_start, 1), //indices
                                                    _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)), 8); // mask

    //multiply new entries:

  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}
#endif


/** @brief Two-way numeric merge of two sorted (index, value) sequences.
*
* Like row_C_scan_symbolic_vector_1, but each emitted index carries the sum of the matching
* input values scaled by factor1 / factor2 respectively. Output is always written.
*
* @return Number of (index, value) pairs written.
*/
template<typename NumericT>
unsigned int row_C_scan_numeric_vector_1(unsigned int const *input1_index_begin, unsigned int const *input1_index_end, NumericT const *input1_values_begin, NumericT factor1,
                                         unsigned int const *input2_index_begin, unsigned int const *input2_index_end, NumericT const *input2_values_begin, NumericT factor2,
                                         unsigned int termination_index,
                                         unsigned int *output_index_begin, NumericT *output_values_begin)
{
  unsigned int *output_ptr = output_index_begin;

  unsigned int index1 = (input1_index_begin < input1_index_end) ? *input1_index_begin : termination_index;
  unsigned int index2 = (input2_index_begin < input2_index_end) ? *input2_index_begin : termination_index;

  while (1)
  {
    unsigned int min_index = std::min(index1, index2);
    NumericT value = 0;

    if (min_index == termination_index)
      break;

    // Both branches may fire on a tie, summing the two contributions:
    if (min_index == index1)
    {
      ++input1_index_begin;
      index1 = (input1_index_begin < input1_index_end) ? *input1_index_begin : termination_index;

      value += factor1 * *input1_values_begin;
      ++input1_values_begin;
    }

    if (min_index == index2)
    {
      ++input2_index_begin;
      index2 = (input2_index_begin < input2_index_end) ? *input2_index_begin : termination_index;

      value += factor2 * *input2_values_begin;
      ++input2_values_begin;
    }

    // write current entry:
    *output_ptr = min_index;
    ++output_ptr;
    *output_values_begin = value;
    ++output_values_begin;
  }

  return static_cast<unsigned int>(output_ptr - output_index_begin);
}

/** @brief Computes one row of C = A * B numerically, writing indices and values into the CSR buffers of C.
*
* Mirrors the control flow of row_C_scan_symbolic_vector: rows of B selected by row
* [row_start_A, row_end_A) of A are folded in batches (8 with AVX2 when enough rows remain,
* otherwise 2 or 1) through the double-buffered scratch arrays row_C_vector_1/2 (+ values),
* with row_C_vector_3 as batch scratch. The final merge writes directly to
* C_col_buffer/C_elements at offset row_start_C.
*
* NOTE(review): the AVX2 helper takes double pointers, so the VIENNACL_WITH_AVX2 fast path
* presumably requires NumericT == double — confirm against the callers before enabling it
* for other types.
*
* @param row_end_C  Unused (the row length was already fixed by the symbolic phase).
*/
template<typename NumericT>
void row_C_scan_numeric_vector(unsigned int row_start_A, unsigned int row_end_A, unsigned int const *A_col_buffer, NumericT const *A_elements,
                               unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, NumericT const *B_elements, unsigned int B_size2,
                               unsigned int row_start_C, unsigned int row_end_C, unsigned int *C_col_buffer, NumericT *C_elements,
                               unsigned int *row_C_vector_1, NumericT *row_C_vector_1_values,
                               unsigned int *row_C_vector_2, NumericT *row_C_vector_2_values,
                               unsigned int *row_C_vector_3, NumericT *row_C_vector_3_values)
{
  (void)row_end_C; // deliberately unused; silences unused-parameter warnings

  // Trivial case: row length 0:
  if (row_start_A == row_end_A)
    return;

  // Trivial case: row length 1: scale the single row of B directly into C.
  if (row_end_A - row_start_A == 1)
  {
    unsigned int A_col = A_col_buffer[row_start_A];
    unsigned int B_end = B_row_buffer[A_col + 1];
    NumericT A_value   = A_elements[row_start_A];
    C_col_buffer += row_start_C;
    C_elements   += row_start_C;
    for (unsigned int j = B_row_buffer[A_col]; j < B_end; ++j, ++C_col_buffer, ++C_elements)
    {
      *C_col_buffer = B_col_buffer[j];
      *C_elements   = A_value * B_elements[j];
    }
    return;
  }

  unsigned int row_C_len = 0;
  if (row_end_A - row_start_A == 2) // directly merge to C:
  {
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];

    unsigned int B_offset_1 = B_row_buffer[A_col_1];
    unsigned int B_offset_2 = B_row_buffer[A_col_2];

    row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
                                B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
                                B_size2,
                                C_col_buffer + row_start_C, C_elements + row_start_C);
    return;
  }
#ifdef VIENNACL_WITH_AVX2
  else if (row_end_A - row_start_A > 10) // safely merge eight rows into temporary buffer:
  {
    row_C_len = row_C_scan_numeric_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A), A_elements + row_start_A,
                                               (const int*)B_row_buffer, (const int*)B_col_buffer, B_elements, int(B_size2),
                                               (int*)row_C_vector_1, row_C_vector_1_values);
    row_start_A += 8;
  }
#endif
  else // safely merge two rows into temporary buffer:
  {
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];

    unsigned int B_offset_1 = B_row_buffer[A_col_1];
    unsigned int B_offset_2 = B_row_buffer[A_col_2];

    row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
                                            B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
                                            B_size2,
                                            row_C_vector_1, row_C_vector_1_values);
    row_start_A += 2;
  }

  // process remaining rows:
  while (row_end_A > row_start_A)
  {
#ifdef VIENNACL_WITH_AVX2
    if (row_end_A - row_start_A > 9) // code in other if-conditionals ensures that values get written to C
    {
      unsigned int merged_len = row_C_scan_numeric_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A), A_elements + row_start_A,
                                                               (const int*)B_row_buffer, (const int*)B_col_buffer, B_elements, int(B_size2),
                                                               (int*)row_C_vector_3, row_C_vector_3_values);
      row_C_len = row_C_scan_numeric_vector_1(row_C_vector_3, row_C_vector_3 + merged_len, row_C_vector_3_values, NumericT(1.0),
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              row_C_vector_2, row_C_vector_2_values);
      row_start_A += 8;
    }
    else
#endif
    if (row_start_A + 1 == row_end_A) // last row to merge, write directly to C:
    {
      unsigned int A_col    = A_col_buffer[row_start_A];
      unsigned int B_offset = B_row_buffer[A_col];

      row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset, B_col_buffer + B_row_buffer[A_col+1], B_elements + B_offset, A_elements[row_start_A],
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              C_col_buffer + row_start_C, C_elements + row_start_C);
      return;
    }
    else if (row_start_A + 2 < row_end_A) // at least three more rows left, so merge two
    {
      // merge two rows of B into scratch, then fold into the accumulated buffer:
      unsigned int A_col_1 = A_col_buffer[row_start_A];
      unsigned int A_col_2 = A_col_buffer[row_start_A + 1];

      unsigned int B_offset_1 = B_row_buffer[A_col_1];
      unsigned int B_offset_2 = B_row_buffer[A_col_2];

      unsigned int merged_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
                                                            B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
                                                            B_size2,
                                                            row_C_vector_3, row_C_vector_3_values);
      row_C_len = row_C_scan_numeric_vector_1(row_C_vector_3, row_C_vector_3 + merged_len, row_C_vector_3_values, NumericT(1.0),
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              row_C_vector_2, row_C_vector_2_values);
      row_start_A += 2;
    }
    else // exactly two rows remain: fold in one, the final iteration writes to C
    {
      unsigned int A_col    = A_col_buffer[row_start_A];
      unsigned int B_offset = B_row_buffer[A_col];

      row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset, B_col_buffer + B_row_buffer[A_col+1], B_elements + B_offset, A_elements[row_start_A],
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              row_C_vector_2, row_C_vector_2_values);
      ++row_start_A;
    }

    // double-buffering: keep indices and values in sync when swapping roles.
    std::swap(row_C_vector_1,        row_C_vector_2);
    std::swap(row_C_vector_1_values, row_C_vector_2_values);
  }
}


} // namespace host_based
} //namespace linalg
} //namespace viennacl


#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp new file mode 100644 index 0000000..b4944a2 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp @@ -0,0 +1,1188 @@ +#ifndef VIENNACL_LINALG_HOST_BASED_VECTOR_OPERATIONS_HPP_ +#define VIENNACL_LINALG_HOST_BASED_VECTOR_OPERATIONS_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/host_based/vector_operations.hpp + @brief Implementations of vector operations using a plain single-threaded or OpenMP-enabled execution on CPU +*/ + +#include <cmath> +#include <algorithm> //for std::max and std::min + +#include "viennacl/forwards.h" +#include "viennacl/scalar.hpp" +#include "viennacl/tools/tools.hpp" +#include "viennacl/meta/predicate.hpp" +#include "viennacl/meta/enable_if.hpp" +#include "viennacl/traits/size.hpp" +#include "viennacl/traits/start.hpp" +#include "viennacl/linalg/host_based/common.hpp" +#include "viennacl/linalg/detail/op_applier.hpp" +#include "viennacl/traits/stride.hpp" + +#ifdef VIENNACL_WITH_OPENMP +#include <omp.h> +#endif + +// Minimum vector size 
for using OpenMP on vector operations: +#ifndef VIENNACL_OPENMP_VECTOR_MIN_SIZE + #define VIENNACL_OPENMP_VECTOR_MIN_SIZE 5000 +#endif + +namespace viennacl +{ +namespace linalg +{ +namespace host_based +{ +namespace detail +{ + template<typename NumericT> + NumericT flip_sign(NumericT val) { return -val; } + inline unsigned long flip_sign(unsigned long val) { return val; } + inline unsigned int flip_sign(unsigned int val) { return val; } + inline unsigned short flip_sign(unsigned short val) { return val; } + inline unsigned char flip_sign(unsigned char val) { return val; } +} + +// +// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here! +// +template<typename DestNumericT, typename SrcNumericT> +void convert(vector_base<DestNumericT> & dest, vector_base<SrcNumericT> const & src) +{ + DestNumericT * data_dest = detail::extract_raw_pointer<DestNumericT>(dest); + SrcNumericT const * data_src = detail::extract_raw_pointer<SrcNumericT>(src); + + vcl_size_t start_dest = viennacl::traits::start(dest); + vcl_size_t inc_dest = viennacl::traits::stride(dest); + vcl_size_t size_dest = viennacl::traits::size(dest); + + vcl_size_t start_src = viennacl::traits::start(src); + vcl_size_t inc_src = viennacl::traits::stride(src); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (size_dest > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + for (long i = 0; i < static_cast<long>(size_dest); ++i) + data_dest[static_cast<vcl_size_t>(i)*inc_dest+start_dest] = static_cast<DestNumericT>(data_src[static_cast<vcl_size_t>(i)*inc_src+start_src]); +} + +template<typename NumericT, typename ScalarT1> +void av(vector_base<NumericT> & vec1, + vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha) +{ + typedef NumericT value_type; + + value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); + value_type const * 
data_vec2 = detail::extract_raw_pointer<value_type>(vec2); + + value_type data_alpha = alpha; + if (flip_sign_alpha) + data_alpha = detail::flip_sign(data_alpha); + + vcl_size_t start1 = viennacl::traits::start(vec1); + vcl_size_t inc1 = viennacl::traits::stride(vec1); + vcl_size_t size1 = viennacl::traits::size(vec1); + + vcl_size_t start2 = viennacl::traits::start(vec2); + vcl_size_t inc2 = viennacl::traits::stride(vec2); + + if (reciprocal_alpha) + { +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + for (long i = 0; i < static_cast<long>(size1); ++i) + data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha; + } + else + { +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + for (long i = 0; i < static_cast<long>(size1); ++i) + data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha; + } +} + + +template<typename NumericT, typename ScalarT1, typename ScalarT2> +void avbv(vector_base<NumericT> & vec1, + vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t /* len_alpha */, bool reciprocal_alpha, bool flip_sign_alpha, + vector_base<NumericT> const & vec3, ScalarT2 const & beta, vcl_size_t /* len_beta */, bool reciprocal_beta, bool flip_sign_beta) +{ + typedef NumericT value_type; + + value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); + value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2); + value_type const * data_vec3 = detail::extract_raw_pointer<value_type>(vec3); + + value_type data_alpha = alpha; + if (flip_sign_alpha) + data_alpha = detail::flip_sign(data_alpha); + + value_type data_beta = beta; + if (flip_sign_beta) + data_beta = detail::flip_sign(data_beta); + + vcl_size_t start1 = viennacl::traits::start(vec1); + vcl_size_t inc1 = viennacl::traits::stride(vec1); 
+ vcl_size_t size1 = viennacl::traits::size(vec1); + + vcl_size_t start2 = viennacl::traits::start(vec2); + vcl_size_t inc2 = viennacl::traits::stride(vec2); + + vcl_size_t start3 = viennacl::traits::start(vec3); + vcl_size_t inc3 = viennacl::traits::stride(vec3); + + if (reciprocal_alpha) + { + if (reciprocal_beta) + { +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + for (long i = 0; i < static_cast<long>(size1); ++i) + data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta; + } + else + { +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + for (long i = 0; i < static_cast<long>(size1); ++i) + data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta; + } + } + else + { + if (reciprocal_beta) + { +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + for (long i = 0; i < static_cast<long>(size1); ++i) + data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta; + } + else + { +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + for (long i = 0; i < static_cast<long>(size1); ++i) + data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta; + } + } +} + + +template<typename NumericT, typename ScalarT1, typename ScalarT2> +void avbv_v(vector_base<NumericT> & vec1, + vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool 
reciprocal_alpha, bool flip_sign_alpha, + vector_base<NumericT> const & vec3, ScalarT2 const & beta, vcl_size_t /*len_beta*/, bool reciprocal_beta, bool flip_sign_beta) +{ + typedef NumericT value_type; + + value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); + value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2); + value_type const * data_vec3 = detail::extract_raw_pointer<value_type>(vec3); + + value_type data_alpha = alpha; + if (flip_sign_alpha) + data_alpha = detail::flip_sign(data_alpha); + + value_type data_beta = beta; + if (flip_sign_beta) + data_beta = detail::flip_sign(data_beta); + + vcl_size_t start1 = viennacl::traits::start(vec1); + vcl_size_t inc1 = viennacl::traits::stride(vec1); + vcl_size_t size1 = viennacl::traits::size(vec1); + + vcl_size_t start2 = viennacl::traits::start(vec2); + vcl_size_t inc2 = viennacl::traits::stride(vec2); + + vcl_size_t start3 = viennacl::traits::start(vec3); + vcl_size_t inc3 = viennacl::traits::stride(vec3); + + if (reciprocal_alpha) + { + if (reciprocal_beta) + { +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + for (long i = 0; i < static_cast<long>(size1); ++i) + data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta; + } + else + { +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + for (long i = 0; i < static_cast<long>(size1); ++i) + data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta; + } + } + else + { + if (reciprocal_beta) + { +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + for (long i = 0; i < static_cast<long>(size1); ++i) + 
data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta; + } + else + { +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + for (long i = 0; i < static_cast<long>(size1); ++i) + data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta; + } + } +} + + + + +/** @brief Assign a constant value to a vector (-range/-slice) +* +* @param vec1 The vector to which the value should be assigned +* @param alpha The value to be assigned +* @param up_to_internal_size Specifies whether alpha should also be written to padded memory (mostly used for clearing the whole buffer). +*/ +template<typename NumericT> +void vector_assign(vector_base<NumericT> & vec1, const NumericT & alpha, bool up_to_internal_size = false) +{ + typedef NumericT value_type; + + value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); + + vcl_size_t start1 = viennacl::traits::start(vec1); + vcl_size_t inc1 = viennacl::traits::stride(vec1); + vcl_size_t size1 = viennacl::traits::size(vec1); + vcl_size_t loop_bound = up_to_internal_size ? vec1.internal_size() : size1; //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding. 
+ + value_type data_alpha = static_cast<value_type>(alpha); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (loop_bound > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + for (long i = 0; i < static_cast<long>(loop_bound); ++i) + data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_alpha; +} + + +/** @brief Swaps the contents of two vectors, data is copied +* +* @param vec1 The first vector (or -range, or -slice) +* @param vec2 The second vector (or -range, or -slice) +*/ +template<typename NumericT> +void vector_swap(vector_base<NumericT> & vec1, vector_base<NumericT> & vec2) +{ + typedef NumericT value_type; + + value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); + value_type * data_vec2 = detail::extract_raw_pointer<value_type>(vec2); + + vcl_size_t start1 = viennacl::traits::start(vec1); + vcl_size_t inc1 = viennacl::traits::stride(vec1); + vcl_size_t size1 = viennacl::traits::size(vec1); + + vcl_size_t start2 = viennacl::traits::start(vec2); + vcl_size_t inc2 = viennacl::traits::stride(vec2); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + for (long i = 0; i < static_cast<long>(size1); ++i) + { + value_type temp = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2]; + data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] = data_vec1[static_cast<vcl_size_t>(i)*inc1+start1]; + data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = temp; + } +} + + +///////////////////////// Elementwise operations ///////////// + +/** @brief Implementation of the element-wise operation v1 = v2 .* v3 and v1 = v2 ./ v3 (using MATLAB syntax) +* +* @param vec1 The result vector (or -range, or -slice) +* @param proxy The proxy object holding v2, v3 and the operation +*/ +template<typename NumericT, typename OpT> +void element_op(vector_base<NumericT> & vec1, + vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_binary<OpT> > const & proxy) +{ + typedef NumericT 
value_type; + typedef viennacl::linalg::detail::op_applier<op_element_binary<OpT> > OpFunctor; + + value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); + value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(proxy.lhs()); + value_type const * data_vec3 = detail::extract_raw_pointer<value_type>(proxy.rhs()); + + vcl_size_t start1 = viennacl::traits::start(vec1); + vcl_size_t inc1 = viennacl::traits::stride(vec1); + vcl_size_t size1 = viennacl::traits::size(vec1); + + vcl_size_t start2 = viennacl::traits::start(proxy.lhs()); + vcl_size_t inc2 = viennacl::traits::stride(proxy.lhs()); + + vcl_size_t start3 = viennacl::traits::start(proxy.rhs()); + vcl_size_t inc3 = viennacl::traits::stride(proxy.rhs()); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + for (long i = 0; i < static_cast<long>(size1); ++i) + OpFunctor::apply(data_vec1[static_cast<vcl_size_t>(i)*inc1+start1], data_vec2[static_cast<vcl_size_t>(i)*inc2+start2], data_vec3[static_cast<vcl_size_t>(i)*inc3+start3]); +} + +/** @brief Implementation of the element-wise operation v1 = v2 .* v3 and v1 = v2 ./ v3 (using MATLAB syntax) +* +* @param vec1 The result vector (or -range, or -slice) +* @param proxy The proxy object holding v2, v3 and the operation +*/ +template<typename NumericT, typename OpT> +void element_op(vector_base<NumericT> & vec1, + vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<OpT> > const & proxy) +{ + typedef NumericT value_type; + typedef viennacl::linalg::detail::op_applier<op_element_unary<OpT> > OpFunctor; + + value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); + value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(proxy.lhs()); + + vcl_size_t start1 = viennacl::traits::start(vec1); + vcl_size_t inc1 = viennacl::traits::stride(vec1); + vcl_size_t size1 = viennacl::traits::size(vec1); + + vcl_size_t start2 = 
viennacl::traits::start(proxy.lhs()); + vcl_size_t inc2 = viennacl::traits::stride(proxy.lhs()); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + for (long i = 0; i < static_cast<long>(size1); ++i) + OpFunctor::apply(data_vec1[static_cast<vcl_size_t>(i)*inc1+start1], data_vec2[static_cast<vcl_size_t>(i)*inc2+start2]); +} + + +///////////////////////// Norms and inner product /////////////////// + + +//implementation of inner product: + +namespace detail +{ + +// the following circumvents problems when trying to use a variable of template parameter type for a reduction. +// Such a behavior is not covered by the OpenMP standard, hence we manually apply some preprocessor magic to resolve the problem. +// See https://github.com/viennacl/viennacl-dev/issues/112 for a detailed explanation and discussion. + +#define VIENNACL_INNER_PROD_IMPL_1(RESULTSCALART, TEMPSCALART) \ + inline RESULTSCALART inner_prod_impl(RESULTSCALART const * data_vec1, vcl_size_t start1, vcl_size_t inc1, vcl_size_t size1, \ + RESULTSCALART const * data_vec2, vcl_size_t start2, vcl_size_t inc2) { \ + TEMPSCALART temp = 0; + +#define VIENNACL_INNER_PROD_IMPL_2(RESULTSCALART) \ + for (long i = 0; i < static_cast<long>(size1); ++i) \ + temp += data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] * data_vec2[static_cast<vcl_size_t>(i)*inc2+start2]; \ + return static_cast<RESULTSCALART>(temp); \ + } + +// char +VIENNACL_INNER_PROD_IMPL_1(char, int) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_INNER_PROD_IMPL_2(char) + +VIENNACL_INNER_PROD_IMPL_1(unsigned char, int) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_INNER_PROD_IMPL_2(unsigned char) + + +// short +VIENNACL_INNER_PROD_IMPL_1(short, int) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: 
temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_INNER_PROD_IMPL_2(short) + +VIENNACL_INNER_PROD_IMPL_1(unsigned short, int) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_INNER_PROD_IMPL_2(unsigned short) + + +// int +VIENNACL_INNER_PROD_IMPL_1(int, int) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_INNER_PROD_IMPL_2(int) + +VIENNACL_INNER_PROD_IMPL_1(unsigned int, unsigned int) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_INNER_PROD_IMPL_2(unsigned int) + + +// long +VIENNACL_INNER_PROD_IMPL_1(long, long) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_INNER_PROD_IMPL_2(long) + +VIENNACL_INNER_PROD_IMPL_1(unsigned long, unsigned long) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_INNER_PROD_IMPL_2(unsigned long) + + +// float +VIENNACL_INNER_PROD_IMPL_1(float, float) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_INNER_PROD_IMPL_2(float) + +// double +VIENNACL_INNER_PROD_IMPL_1(double, double) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_INNER_PROD_IMPL_2(double) + +#undef VIENNACL_INNER_PROD_IMPL_1 +#undef VIENNACL_INNER_PROD_IMPL_2 +} + +/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2). 
+* +* @param vec1 The first vector +* @param vec2 The second vector +* @param result The result scalar (on the gpu) +*/ +template<typename NumericT, typename ScalarT> +void inner_prod_impl(vector_base<NumericT> const & vec1, + vector_base<NumericT> const & vec2, + ScalarT & result) +{ + typedef NumericT value_type; + + value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); + value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2); + + vcl_size_t start1 = viennacl::traits::start(vec1); + vcl_size_t inc1 = viennacl::traits::stride(vec1); + vcl_size_t size1 = viennacl::traits::size(vec1); + + vcl_size_t start2 = viennacl::traits::start(vec2); + vcl_size_t inc2 = viennacl::traits::stride(vec2); + + result = detail::inner_prod_impl(data_vec1, start1, inc1, size1, + data_vec2, start2, inc2); //Note: Assignment to result might be expensive, thus a temporary is introduced here +} + +template<typename NumericT> +void inner_prod_impl(vector_base<NumericT> const & x, + vector_tuple<NumericT> const & vec_tuple, + vector_base<NumericT> & result) +{ + typedef NumericT value_type; + + value_type const * data_x = detail::extract_raw_pointer<value_type>(x); + + vcl_size_t start_x = viennacl::traits::start(x); + vcl_size_t inc_x = viennacl::traits::stride(x); + vcl_size_t size_x = viennacl::traits::size(x); + + std::vector<value_type> temp(vec_tuple.const_size()); + std::vector<value_type const *> data_y(vec_tuple.const_size()); + std::vector<vcl_size_t> start_y(vec_tuple.const_size()); + std::vector<vcl_size_t> stride_y(vec_tuple.const_size()); + + for (vcl_size_t j=0; j<vec_tuple.const_size(); ++j) + { + data_y[j] = detail::extract_raw_pointer<value_type>(vec_tuple.const_at(j)); + start_y[j] = viennacl::traits::start(vec_tuple.const_at(j)); + stride_y[j] = viennacl::traits::stride(vec_tuple.const_at(j)); + } + + // Note: No OpenMP here because it cannot perform a reduction on temp-array. 
Savings in memory bandwidth are expected to still justify this approach... + for (vcl_size_t i = 0; i < size_x; ++i) + { + value_type entry_x = data_x[i*inc_x+start_x]; + for (vcl_size_t j=0; j < vec_tuple.const_size(); ++j) + temp[j] += entry_x * data_y[j][i*stride_y[j]+start_y[j]]; + } + + for (vcl_size_t j=0; j < vec_tuple.const_size(); ++j) + result[j] = temp[j]; //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation +} + + +namespace detail +{ + +#define VIENNACL_NORM_1_IMPL_1(RESULTSCALART, TEMPSCALART) \ + inline RESULTSCALART norm_1_impl(RESULTSCALART const * data_vec1, vcl_size_t start1, vcl_size_t inc1, vcl_size_t size1) { \ + TEMPSCALART temp = 0; + +#define VIENNACL_NORM_1_IMPL_2(RESULTSCALART, TEMPSCALART) \ + for (long i = 0; i < static_cast<long>(size1); ++i) \ + temp += static_cast<TEMPSCALART>(std::fabs(static_cast<double>(data_vec1[static_cast<vcl_size_t>(i)*inc1+start1]))); \ + return static_cast<RESULTSCALART>(temp); \ + } + +// char +VIENNACL_NORM_1_IMPL_1(char, int) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_NORM_1_IMPL_2(char, int) + +VIENNACL_NORM_1_IMPL_1(unsigned char, int) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_NORM_1_IMPL_2(unsigned char, int) + +// short +VIENNACL_NORM_1_IMPL_1(short, int) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_NORM_1_IMPL_2(short, int) + +VIENNACL_NORM_1_IMPL_1(unsigned short, int) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_NORM_1_IMPL_2(unsigned short, int) + + +// int +VIENNACL_NORM_1_IMPL_1(int, int) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > 
VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_NORM_1_IMPL_2(int, int) + +VIENNACL_NORM_1_IMPL_1(unsigned int, unsigned int) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_NORM_1_IMPL_2(unsigned int, unsigned int) + + +// long +VIENNACL_NORM_1_IMPL_1(long, long) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_NORM_1_IMPL_2(long, long) + +VIENNACL_NORM_1_IMPL_1(unsigned long, unsigned long) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_NORM_1_IMPL_2(unsigned long, unsigned long) + + +// float +VIENNACL_NORM_1_IMPL_1(float, float) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_NORM_1_IMPL_2(float, float) + +// double +VIENNACL_NORM_1_IMPL_1(double, double) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_NORM_1_IMPL_2(double, double) + +#undef VIENNACL_NORM_1_IMPL_1 +#undef VIENNACL_NORM_1_IMPL_2 + +} + +/** @brief Computes the l^1-norm of a vector +* +* @param vec1 The vector +* @param result The result scalar +*/ +template<typename NumericT, typename ScalarT> +void norm_1_impl(vector_base<NumericT> const & vec1, + ScalarT & result) +{ + typedef NumericT value_type; + + value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); + + vcl_size_t start1 = viennacl::traits::start(vec1); + vcl_size_t inc1 = viennacl::traits::stride(vec1); + vcl_size_t size1 = viennacl::traits::size(vec1); + + result = detail::norm_1_impl(data_vec1, start1, inc1, size1); //Note: Assignment to result might be expensive, thus using a temporary for accumulation +} + + + +namespace detail +{ + +#define 
VIENNACL_NORM_2_IMPL_1(RESULTSCALART, TEMPSCALART) \ + inline RESULTSCALART norm_2_impl(RESULTSCALART const * data_vec1, vcl_size_t start1, vcl_size_t inc1, vcl_size_t size1) { \ + TEMPSCALART temp = 0; + +#define VIENNACL_NORM_2_IMPL_2(RESULTSCALART, TEMPSCALART) \ + for (long i = 0; i < static_cast<long>(size1); ++i) { \ + RESULTSCALART data = data_vec1[static_cast<vcl_size_t>(i)*inc1+start1]; \ + temp += static_cast<TEMPSCALART>(data * data); \ + } \ + return static_cast<RESULTSCALART>(temp); \ + } + +// char +VIENNACL_NORM_2_IMPL_1(char, int) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_NORM_2_IMPL_2(char, int) + +VIENNACL_NORM_2_IMPL_1(unsigned char, int) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_NORM_2_IMPL_2(unsigned char, int) + + +// short +VIENNACL_NORM_2_IMPL_1(short, int) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_NORM_2_IMPL_2(short, int) + +VIENNACL_NORM_2_IMPL_1(unsigned short, int) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_NORM_2_IMPL_2(unsigned short, int) + + +// int +VIENNACL_NORM_2_IMPL_1(int, int) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_NORM_2_IMPL_2(int, int) + +VIENNACL_NORM_2_IMPL_1(unsigned int, unsigned int) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_NORM_2_IMPL_2(unsigned int, unsigned int) + + +// long +VIENNACL_NORM_2_IMPL_1(long, long) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif 
+VIENNACL_NORM_2_IMPL_2(long, long) + +VIENNACL_NORM_2_IMPL_1(unsigned long, unsigned long) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_NORM_2_IMPL_2(unsigned long, unsigned long) + + +// float +VIENNACL_NORM_2_IMPL_1(float, float) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_NORM_2_IMPL_2(float, float) + +// double +VIENNACL_NORM_2_IMPL_1(double, double) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif +VIENNACL_NORM_2_IMPL_2(double, double) + +#undef VIENNACL_NORM_2_IMPL_1 +#undef VIENNACL_NORM_2_IMPL_2 + +} + + +/** @brief Computes the l^2-norm of a vector - implementation +* +* @param vec1 The vector +* @param result The result scalar +*/ +template<typename NumericT, typename ScalarT> +void norm_2_impl(vector_base<NumericT> const & vec1, + ScalarT & result) +{ + typedef NumericT value_type; + + value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); + + vcl_size_t start1 = viennacl::traits::start(vec1); + vcl_size_t inc1 = viennacl::traits::stride(vec1); + vcl_size_t size1 = viennacl::traits::size(vec1); + + result = std::sqrt(detail::norm_2_impl(data_vec1, start1, inc1, size1)); //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation +} + +/** @brief Computes the supremum-norm of a vector +* +* @param vec1 The vector +* @param result The result scalar +*/ +template<typename NumericT, typename ScalarT> +void norm_inf_impl(vector_base<NumericT> const & vec1, + ScalarT & result) +{ + typedef NumericT value_type; + + value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); + + vcl_size_t start1 = viennacl::traits::start(vec1); + vcl_size_t inc1 = viennacl::traits::stride(vec1); + vcl_size_t size1 = viennacl::traits::size(vec1); + + 
vcl_size_t thread_count=1; + + #ifdef VIENNACL_WITH_OPENMP + if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) + thread_count = omp_get_max_threads(); + #endif + + std::vector<value_type> temp(thread_count); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + { + vcl_size_t id = 0; +#ifdef VIENNACL_WITH_OPENMP + id = omp_get_thread_num(); +#endif + + vcl_size_t begin = (size1 * id) / thread_count; + vcl_size_t end = (size1 * (id + 1)) / thread_count; + temp[id] = 0; + + for (vcl_size_t i = begin; i < end; ++i) + temp[id] = std::max<value_type>(temp[id], static_cast<value_type>(std::fabs(static_cast<double>(data_vec1[i*inc1+start1])))); //casting to double in order to avoid problems if T is an integer type + } + for (vcl_size_t i = 1; i < thread_count; ++i) + temp[0] = std::max<value_type>( temp[0], temp[i]); + result = temp[0]; +} + +//This function should return a CPU scalar, otherwise statements like +// vcl_rhs[index_norm_inf(vcl_rhs)] +// are ambiguous +/** @brief Computes the index of the first entry that is equal to the supremum-norm in modulus. +* +* @param vec1 The vector +* @return The result. Note that the result must be a CPU scalar (unsigned int), since gpu scalars are floating point types. 
+*/ +template<typename NumericT> +vcl_size_t index_norm_inf(vector_base<NumericT> const & vec1) +{ + typedef NumericT value_type; + + value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); + + vcl_size_t start1 = viennacl::traits::start(vec1); + vcl_size_t inc1 = viennacl::traits::stride(vec1); + vcl_size_t size1 = viennacl::traits::size(vec1); + vcl_size_t thread_count=1; + +#ifdef VIENNACL_WITH_OPENMP + if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) + thread_count = omp_get_max_threads(); +#endif + + std::vector<value_type> temp(thread_count); + std::vector<vcl_size_t> index(thread_count); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + { + vcl_size_t id = 0; +#ifdef VIENNACL_WITH_OPENMP + id = omp_get_thread_num(); +#endif + vcl_size_t begin = (size1 * id) / thread_count; + vcl_size_t end = (size1 * (id + 1)) / thread_count; + index[id] = start1; + temp[id] = 0; + value_type data; + + for (vcl_size_t i = begin; i < end; ++i) + { + data = static_cast<value_type>(std::fabs(static_cast<double>(data_vec1[i*inc1+start1]))); //casting to double in order to avoid problems if T is an integer type + if (data > temp[id]) + { + index[id] = i; + temp[id] = data; + } + } + } + for (vcl_size_t i = 1; i < thread_count; ++i) + { + if (temp[i] > temp[0]) + { + index[0] = index[i]; + temp[0] = temp[i]; + } + } + return index[0]; +} + +/** @brief Computes the maximum of a vector +* +* @param vec1 The vector +* @param result The result scalar +*/ +template<typename NumericT, typename ScalarT> +void max_impl(vector_base<NumericT> const & vec1, + ScalarT & result) +{ + typedef NumericT value_type; + + value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); + + vcl_size_t start1 = viennacl::traits::start(vec1); + vcl_size_t inc1 = viennacl::traits::stride(vec1); + vcl_size_t size1 = viennacl::traits::size(vec1); + + vcl_size_t thread_count=1; + +#ifdef VIENNACL_WITH_OPENMP + if(size1 
> VIENNACL_OPENMP_VECTOR_MIN_SIZE) + thread_count = omp_get_max_threads(); +#endif + + std::vector<value_type> temp(thread_count); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + { + vcl_size_t id = 0; +#ifdef VIENNACL_WITH_OPENMP + id = omp_get_thread_num(); +#endif + vcl_size_t begin = (size1 * id) / thread_count; + vcl_size_t end = (size1 * (id + 1)) / thread_count; + temp[id] = data_vec1[start1]; + + for (vcl_size_t i = begin; i < end; ++i) + { + value_type v = data_vec1[i*inc1+start1];//Note: Assignment to 'vec1' in std::min might be expensive, thus 'v' is used for the function + temp[id] = std::max<value_type>(temp[id],v); + } + } + for (vcl_size_t i = 1; i < thread_count; ++i) + temp[0] = std::max<value_type>( temp[0], temp[i]); + result = temp[0];//Note: Assignment to result might be expensive, thus 'temp' is used for accumulation +} + +/** @brief Computes the minimum of a vector +* +* @param vec1 The vector +* @param result The result scalar +*/ +template<typename NumericT, typename ScalarT> +void min_impl(vector_base<NumericT> const & vec1, + ScalarT & result) +{ + typedef NumericT value_type; + + value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); + + vcl_size_t start1 = viennacl::traits::start(vec1); + vcl_size_t inc1 = viennacl::traits::stride(vec1); + vcl_size_t size1 = viennacl::traits::size(vec1); + + vcl_size_t thread_count=1; + +#ifdef VIENNACL_WITH_OPENMP + if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) + thread_count = omp_get_max_threads(); +#endif + + std::vector<value_type> temp(thread_count); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + { + vcl_size_t id = 0; +#ifdef VIENNACL_WITH_OPENMP + id = omp_get_thread_num(); +#endif + vcl_size_t begin = (size1 * id) / thread_count; + vcl_size_t end = (size1 * (id + 1)) / thread_count; + temp[id] = data_vec1[start1]; + + for (vcl_size_t i = begin; i < end; 
++i) + { + value_type v = data_vec1[i*inc1+start1];//Note: Assignment to 'vec1' in std::min might be expensive, thus 'v' is used for the function + temp[id] = std::min<value_type>(temp[id],v); + } + } + for (vcl_size_t i = 1; i < thread_count; ++i) + temp[0] = std::min<value_type>( temp[0], temp[i]); + result = temp[0];//Note: Assignment to result might be expensive, thus 'temp' is used for accumulation +} + +/** @brief Computes the sum of all elements from the vector +* +* @param vec1 The vector +* @param result The result scalar +*/ +template<typename NumericT, typename ScalarT> +void sum_impl(vector_base<NumericT> const & vec1, + ScalarT & result) +{ + typedef NumericT value_type; + + value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); + + vcl_size_t start1 = viennacl::traits::start(vec1); + vcl_size_t inc1 = viennacl::traits::stride(vec1); + vcl_size_t size1 = viennacl::traits::size(vec1); + + value_type temp = 0; +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for reduction(+:temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + for (long i = 0; i < static_cast<long>(size1); ++i) + temp += data_vec1[static_cast<vcl_size_t>(i)*inc1+start1]; + + result = temp; //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation +} + +/** @brief Computes a plane rotation of two vectors. 
+* +* Computes (x,y) <- (alpha * x + beta * y, -beta * x + alpha * y) +* +* @param vec1 The first vector +* @param vec2 The second vector +* @param alpha The first transformation coefficient +* @param beta The second transformation coefficient +*/ +template<typename NumericT> +void plane_rotation(vector_base<NumericT> & vec1, + vector_base<NumericT> & vec2, + NumericT alpha, NumericT beta) +{ + typedef NumericT value_type; + + value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1); + value_type * data_vec2 = detail::extract_raw_pointer<value_type>(vec2); + + vcl_size_t start1 = viennacl::traits::start(vec1); + vcl_size_t inc1 = viennacl::traits::stride(vec1); + vcl_size_t size1 = viennacl::traits::size(vec1); + + vcl_size_t start2 = viennacl::traits::start(vec2); + vcl_size_t inc2 = viennacl::traits::stride(vec2); + + value_type data_alpha = alpha; + value_type data_beta = beta; + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) +#endif + for (long i = 0; i < static_cast<long>(size1); ++i) + { + value_type temp1 = data_vec1[static_cast<vcl_size_t>(i)*inc1+start1]; + value_type temp2 = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2]; + + data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_alpha * temp1 + data_beta * temp2; + data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] = data_alpha * temp2 - data_beta * temp1; + } +} + +namespace detail +{ + /** @brief Implementation of inclusive_scan and exclusive_scan for the host (OpenMP) backend. 
*/ + template<typename NumericT> + void vector_scan_impl(vector_base<NumericT> const & vec1, + vector_base<NumericT> & vec2, + bool is_inclusive) + { + NumericT const * data_vec1 = detail::extract_raw_pointer<NumericT>(vec1); + NumericT * data_vec2 = detail::extract_raw_pointer<NumericT>(vec2); + + vcl_size_t start1 = viennacl::traits::start(vec1); + vcl_size_t inc1 = viennacl::traits::stride(vec1); + vcl_size_t size1 = viennacl::traits::size(vec1); + if (size1 < 1) + return; + + vcl_size_t start2 = viennacl::traits::start(vec2); + vcl_size_t inc2 = viennacl::traits::stride(vec2); + +#ifdef VIENNACL_WITH_OPENMP + if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE) + { + std::vector<NumericT> thread_results(omp_get_max_threads()); + + // inclusive scan each thread segment: + #pragma omp parallel + { + vcl_size_t work_per_thread = (size1 - 1) / thread_results.size() + 1; + vcl_size_t thread_start = work_per_thread * omp_get_thread_num(); + vcl_size_t thread_stop = std::min<vcl_size_t>(thread_start + work_per_thread, size1); + + NumericT thread_sum = 0; + for(vcl_size_t i = thread_start; i < thread_stop; i++) + thread_sum += data_vec1[i * inc1 + start1]; + + thread_results[omp_get_thread_num()] = thread_sum; + } + + // exclusive-scan of thread results: + NumericT current_offset = 0; + for (vcl_size_t i=0; i<thread_results.size(); ++i) + { + NumericT tmp = thread_results[i]; + thread_results[i] = current_offset; + current_offset += tmp; + } + + // exclusive/inclusive scan of each segment with correct offset: + #pragma omp parallel + { + vcl_size_t work_per_thread = (size1 - 1) / thread_results.size() + 1; + vcl_size_t thread_start = work_per_thread * omp_get_thread_num(); + vcl_size_t thread_stop = std::min<vcl_size_t>(thread_start + work_per_thread, size1); + + NumericT thread_sum = thread_results[omp_get_thread_num()]; + if (is_inclusive) + { + for(vcl_size_t i = thread_start; i < thread_stop; i++) + { + thread_sum += data_vec1[i * inc1 + start1]; + data_vec2[i * inc2 + 
start2] = thread_sum; + } + } + else + { + for(vcl_size_t i = thread_start; i < thread_stop; i++) + { + NumericT tmp = data_vec1[i * inc1 + start1]; + data_vec2[i * inc2 + start2] = thread_sum; + thread_sum += tmp; + } + } + } + } else +#endif + { + NumericT sum = 0; + if (is_inclusive) + { + for(vcl_size_t i = 0; i < size1; i++) + { + sum += data_vec1[i * inc1 + start1]; + data_vec2[i * inc2 + start2] = sum; + } + } + else + { + for(vcl_size_t i = 0; i < size1; i++) + { + NumericT tmp = data_vec1[i * inc1 + start1]; + data_vec2[i * inc2 + start2] = sum; + sum += tmp; + } + } + } + + } +} + +/** @brief This function implements an inclusive scan on the host using OpenMP. +* +* Given an element vector (x_0, x_1, ..., x_{n-1}), +* this routine computes (x_0, x_0 + x_1, ..., x_0 + x_1 + ... + x_{n-1}) +* +* @param vec1 Input vector: Gets overwritten by the routine. +* @param vec2 The output vector. Either idential to vec1 or non-overlapping. +*/ +template<typename NumericT> +void inclusive_scan(vector_base<NumericT> const & vec1, + vector_base<NumericT> & vec2) +{ + detail::vector_scan_impl(vec1, vec2, true); +} + +/** @brief This function implements an exclusive scan on the host using OpenMP. +* +* Given an element vector (x_0, x_1, ..., x_{n-1}), +* this routine computes (0, x_0, x_0 + x_1, ..., x_0 + x_1 + ... + x_{n-2}) +* +* @param vec1 Input vector: Gets overwritten by the routine. +* @param vec2 The output vector. Either idential to vec1 or non-overlapping. 
+*/ +template<typename NumericT> +void exclusive_scan(vector_base<NumericT> const & vec1, + vector_base<NumericT> & vec2) +{ + detail::vector_scan_impl(vec1, vec2, false); +} + + +} //namespace host_based +} //namespace linalg +} //namespace viennacl + + +#endif http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp new file mode 100644 index 0000000..1038b2b --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp @@ -0,0 +1,228 @@ +#ifndef VIENNACL_LINALG_ICHOL_HPP_ +#define VIENNACL_LINALG_ICHOL_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/ichol.hpp + @brief Implementations of incomplete Cholesky factorization preconditioners with static nonzero pattern. 
+*/ + +#include <vector> +#include <cmath> +#include <iostream> +#include "viennacl/forwards.h" +#include "viennacl/tools/tools.hpp" +#include "viennacl/compressed_matrix.hpp" + +#include "viennacl/linalg/host_based/common.hpp" + +#include <map> + +namespace viennacl +{ +namespace linalg +{ + +/** @brief A tag for incomplete Cholesky factorization with static pattern (ILU0) +*/ +class ichol0_tag {}; + + +/** @brief Implementation of a ILU-preconditioner with static pattern. Optimized version for CSR matrices. + * + * Refer to Chih-Jen Lin and Jorge J. Moré, Incomplete Cholesky Factorizations with Limited Memory, SIAM J. Sci. Comput., 21(1), 24â45 + * for one of many descriptions of incomplete Cholesky Factorizations + * + * @param A The input matrix in CSR format + * // param tag An ichol0_tag in order to dispatch among several other preconditioners. + */ +template<typename NumericT> +void precondition(viennacl::compressed_matrix<NumericT> & A, ichol0_tag const & /* tag */) +{ + assert( (viennacl::traits::context(A).memory_type() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ICHOL0") ); + + NumericT * elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle()); + unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1()); + unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2()); + + //std::cout << A.size1() << std::endl; + for (vcl_size_t i=0; i<A.size1(); ++i) + { + unsigned int row_i_begin = row_buffer[i]; + unsigned int row_i_end = row_buffer[i+1]; + + // get a_ii: + NumericT a_ii = 0; + for (unsigned int buf_index_aii = row_i_begin; buf_index_aii < row_i_end; ++buf_index_aii) + { + if (col_buffer[buf_index_aii] == i) + { + a_ii = std::sqrt(elements[buf_index_aii]); + elements[buf_index_aii] = a_ii; + break; + } + } + + // Now scale column/row i, i.e. 
A(k, i) /= A(i, i) + for (unsigned int buf_index_aii = row_i_begin; buf_index_aii < row_i_end; ++buf_index_aii) + { + if (col_buffer[buf_index_aii] > i) + elements[buf_index_aii] /= a_ii; + } + + // Now compute A(k, j) -= A(k, i) * A(j, i) for all nonzero k, j in column i: + for (unsigned int buf_index_j = row_i_begin; buf_index_j < row_i_end; ++buf_index_j) + { + unsigned int j = col_buffer[buf_index_j]; + if (j <= i) + continue; + + NumericT a_ji = elements[buf_index_j]; + + for (unsigned int buf_index_k = row_i_begin; buf_index_k < row_i_end; ++buf_index_k) + { + unsigned int k = col_buffer[buf_index_k]; + if (k < j) + continue; + + NumericT a_ki = elements[buf_index_k]; + + //Now check whether A(k, j) is in nonzero pattern: + unsigned int row_j_begin = row_buffer[j]; + unsigned int row_j_end = row_buffer[j+1]; + for (unsigned int buf_index_kj = row_j_begin; buf_index_kj < row_j_end; ++buf_index_kj) + { + if (col_buffer[buf_index_kj] == k) + { + elements[buf_index_kj] -= a_ki * a_ji; + break; + } + } + } + } + + } + +} + + +/** @brief Incomplete Cholesky preconditioner class with static pattern (ICHOL0), can be supplied to solve()-routines +*/ +template<typename MatrixT> +class ichol0_precond +{ + typedef typename MatrixT::value_type NumericType; + +public: + ichol0_precond(MatrixT const & mat, ichol0_tag const & tag) : tag_(tag), LLT(mat.size1(), mat.size2(), viennacl::context(viennacl::MAIN_MEMORY)) + { + //initialize preconditioner: + //std::cout << "Start CPU precond" << std::endl; + init(mat); + //std::cout << "End CPU precond" << std::endl; + } + + template<typename VectorT> + void apply(VectorT & vec) const + { + unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LLT.handle1()); + unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LLT.handle2()); + NumericType const * elements = 
viennacl::linalg::host_based::detail::extract_raw_pointer<NumericType>(LLT.handle()); + + // Note: L is stored in a column-oriented fashion, i.e. transposed w.r.t. the row-oriented layout. Thus, the factorization A = L L^T holds L in the upper triangular part of A. + viennacl::linalg::host_based::detail::csr_trans_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, LLT.size2(), lower_tag()); + viennacl::linalg::host_based::detail::csr_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, LLT.size2(), upper_tag()); + } + +private: + void init(MatrixT const & mat) + { + viennacl::context host_ctx(viennacl::MAIN_MEMORY); + viennacl::switch_memory_context(LLT, host_ctx); + + viennacl::copy(mat, LLT); + viennacl::linalg::precondition(LLT, tag_); + } + + ichol0_tag const & tag_; + viennacl::compressed_matrix<NumericType> LLT; +}; + + +/** @brief ILU0 preconditioner class, can be supplied to solve()-routines. +* +* Specialization for compressed_matrix +*/ +template<typename NumericT, unsigned int AlignmentV> +class ichol0_precond< compressed_matrix<NumericT, AlignmentV> > +{ + typedef compressed_matrix<NumericT, AlignmentV> MatrixType; + +public: + ichol0_precond(MatrixType const & mat, ichol0_tag const & tag) : tag_(tag), LLT(mat.size1(), mat.size2(), viennacl::traits::context(mat)) + { + //initialize preconditioner: + //std::cout << "Start GPU precond" << std::endl; + init(mat); + //std::cout << "End GPU precond" << std::endl; + } + + void apply(vector<NumericT> & vec) const + { + if (viennacl::traits::context(vec).memory_type() != viennacl::MAIN_MEMORY) + { + viennacl::context host_ctx(viennacl::MAIN_MEMORY); + viennacl::context old_ctx = viennacl::traits::context(vec); + + viennacl::switch_memory_context(vec, host_ctx); + viennacl::linalg::inplace_solve(trans(LLT), vec, lower_tag()); + viennacl::linalg::inplace_solve( LLT , vec, upper_tag()); + viennacl::switch_memory_context(vec, old_ctx); + } + else //apply ILU0 directly: + { + // Note: L 
is stored in a column-oriented fashion, i.e. transposed w.r.t. the row-oriented layout. Thus, the factorization A = L L^T holds L in the upper triangular part of A. + viennacl::linalg::inplace_solve(trans(LLT), vec, lower_tag()); + viennacl::linalg::inplace_solve( LLT , vec, upper_tag()); + } + } + +private: + void init(MatrixType const & mat) + { + viennacl::context host_ctx(viennacl::MAIN_MEMORY); + viennacl::switch_memory_context(LLT, host_ctx); + LLT = mat; + + viennacl::linalg::precondition(LLT, tag_); + } + + ichol0_tag const & tag_; + viennacl::compressed_matrix<NumericT> LLT; +}; + +} +} + + + + +#endif + + +
