Hei,
I replaced the linking line with

    /usr/lib64/mpi/gcc/openmpi3/bin/mpicxx -march=native -fopenmp-simd
    -DMKL_LP64 -m64
    CMakeFiles/armadillo_with_PETSc.dir/Unity/unity_0_cxx.cxx.o -o
    bin/armadillo_with_PETSc
    -Wl,-rpath,/opt/boost/lib:/opt/fftw3/lib64:/opt/petsc_release/lib
    /usr/lib64/libgsl.so /usr/lib64/libgslcblas.so -lgfortran
    -L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_intel_lp64
    -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl
    /opt/boost/lib/libboost_filesystem.so.1.72.0
    /opt/boost/lib/libboost_mpi.so.1.72.0
    /opt/boost/lib/libboost_program_options.so.1.72.0
    /opt/boost/lib/libboost_serialization.so.1.72.0
    /opt/fftw3/lib64/libfftw3.so /opt/fftw3/lib64/libfftw3_mpi.so
    /opt/petsc_release/lib/libpetsc.so
    /usr/lib64/gcc/x86_64-suse-linux/9/libgomp.so
and now the results are correct. Nevertheless, when comparing the loop
in lines 26-28 of file test_scaling.cpp

    #pragma omp parallel for
    for(int i = 0; i < r_0 * r_1; ++i)
        *(out_mat_ptr + i) = (*(in_mat_ptr + i) * scaling_factor);

the version without `#pragma omp parallel for` is significantly faster
(18 s vs. 28 s) than the version with OpenMP enabled. Why is there
still such a big difference?
Thanks!
Am 17.02.21 um 17:10 schrieb Jed Brown:
> You're using an MKL linked to Intel's OpenMP. I could imagine there being
> symbol conflicts causing MKL to compute wrong results if libgomp symbols are
> picked up.
>
> Note that -fopenmp-simd does not require linking -- it just gives the
> compiler hints about how to vectorize. So you can probably keep using it and
> just stop passing libgomp.so. Alternatively, you can link MKL to work with
> libgomp (see the MKL link advisor).
>
> Roland Richter <[email protected]> writes:
>
>> Hei,
>>
>> when compiling the attached files using the following compilation line
>>
>> //usr/lib64/mpi/gcc/openmpi3/bin/mpicxx -DBOOST_ALL_NO_LIB
>> -DBOOST_FILESYSTEM_DYN_LINK -DBOOST_MPI_DYN_LINK
>> -DBOOST_PROGRAM_OPTIONS_DYN_LINK -DBOOST_SERIALIZATION_DYN_LINK
>> -DUSE_CUDA
>> -I/home/roland/Dokumente/C++-Projekte/armadillo_with_PETSc/include
>> -I/opt/intel/compilers_and_libraries_2020.2.254/linux/mkl/include
>> -I/opt/armadillo/include -isystem /opt/petsc_release/include -isystem
>> /opt/fftw3/include -isystem /opt/boost/include -march=native
>> -fopenmp-simd -DMKL_LP64 -m64 -Wall -Wextra -pedantic -fPIC -flto -O2
>> -funroll-loops -funroll-all-loops -fstrict-aliasing -mavx -march=native
>> -fopenmp -std=gnu++17 -c <source_files> -o <target_files>/
>>
>> and linking them with
>>
>> //usr/lib64/mpi/gcc/openmpi3/bin/mpicxx -march=native -fopenmp-simd
>> -DMKL_LP64 -m64 <target_files> -o bin/armadillo_with_PETSc
>> -Wl,-rpath,/opt/boost/lib:/opt/fftw3/lib64:/opt/petsc_release/lib
>> /usr/lib64/libgsl.so /usr/lib64/libgslcblas.so -lgfortran
>> /opt/intel/mkl/lib/intel64/libmkl_rt.so
>> /opt/boost/lib/libboost_filesystem.so.1.72.0
>> /opt/boost/lib/libboost_mpi.so.1.72.0
>> /opt/boost/lib/libboost_program_options.so.1.72.0
>> /opt/boost/lib/libboost_serialization.so.1.72.0
>> /opt/fftw3/lib64/libfftw3.so /opt/fftw3/lib64/libfftw3_mpi.so
>> /opt/petsc_release/lib/libpetsc.so
>> /usr/lib64/gcc/x86_64-suse-linux/9/libgomp.so/
>>
>> my output is
>>
>> /Arma and PETSc/MatScale are equal:
>> 0//
>> //Arma-time for a matrix size of [1024,
>> 8192]: 24955//
>> //PETSc-time, pointer for a matrix size of [1024,
>> 8192]: 28283//
>> //PETSc-time, MatScale for a matrix size of [1024,
>> 8192]: 23138/
>>
>> but when removing the explicit call to openmp (i.e. removing /-fopenmp/
>> and //usr/lib64/gcc/x86_64-suse-linux/9/libgomp.so/) my result is
>>
>> /Arma and PETSc/MatScale are equal:
>> 1//
>> //Arma-time for a matrix size of [1024,
>> 8192]: 24878//
>> //PETSc-time, pointer for a matrix size of [1024,
>> 8192]: 18942//
>> //PETSc-time, MatScale for a matrix size of [1024,
>> 8192]: 23350/
>>
>> even though both times the executable is linked to
>>
>> / libmkl_intel_lp64.so =>
>> /opt/intel/compilers_and_libraries_2020.2.254/linux/mkl/lib/intel64_lin/libmkl_intel_lp64.so
>> (0x00007f9eebd70000)//
>> // libmkl_core.so =>
>> /opt/intel/compilers_and_libraries_2020.2.254/linux/mkl/lib/intel64_lin/libmkl_core.so
>> (0x00007f9ee77aa000)//
>> // libmkl_intel_thread.so =>
>> /opt/intel/compilers_and_libraries_2020.2.254/linux/mkl/lib/intel64_lin/libmkl_intel_thread.so
>> (0x00007f9ee42c3000)//
>> // libiomp5.so =>
>> /opt/intel/compilers_and_libraries_2020.2.254/linux/compiler/lib/intel64_lin/libiomp5.so
>> (0x00007f9ee3ebd000)//
>> // libgomp.so.1 => /usr/lib64/libgomp.so.1 (0x00007f9ea98bd000)/
>>
>> via the petsc-library. Why does the execution time vary by so much, and
>> why does my result change when calling MatScale (i.e. returning wrong
>> results) when explicitly linking to OpenMP?
>>
>> Thanks!
>>
>> Regards,
>>
>> Roland
>>
>> #include <test_scaling.hpp>
>>
>> int main(int argc, char **args) {
>> PetscMPIInt rank, size;
>> PetscInitialize(&argc, &args, (char*) 0, NULL);
>>
>> MPI_Comm_size(PETSC_COMM_WORLD, &size);
>> MPI_Comm_rank(PETSC_COMM_WORLD, &rank);
>>
>> test_scaling (1024, 8192, false);
>> PetscFinalize();
>> return 0;
>> }
>> #include <test_scaling.hpp>
>>
>> void test_scaling_arma(const arma::cx_mat & in_mat,
>> arma::cx_mat &out_mat,
>> const arma::cx_double
>> &scaling_factor) {
>> out_mat = in_mat;
>> out_mat *= scaling_factor;
>> }
>>
>> void test_scaling_petsc(const Mat &in_mat,
>> Mat &out_mat,
>> const PetscScalar
>> &scaling_factor) {
>> MatZeroEntries(out_mat);
>> MatAXPY(out_mat, scaling_factor, in_mat, SAME_NONZERO_PATTERN);
>> }
>>
>> void test_scaling_petsc_pointer(const Mat &in_mat,
>> Mat &out_mat,
>> const
>> PetscScalar &scaling_factor) {
>> const PetscScalar *in_mat_ptr;
>> PetscScalar *out_mat_ptr;
>> MatDenseGetArrayRead (in_mat, &in_mat_ptr);
>> MatDenseGetArrayWrite (out_mat, &out_mat_ptr);
>> PetscInt r_0, r_1;
>> MatGetLocalSize (out_mat, &r_0, &r_1);
>> #pragma omp parallel for
>> for(int i = 0; i < r_0 * r_1; ++i)
>> *(out_mat_ptr + i) = (*(in_mat_ptr + i) * scaling_factor);
>>
>> MatDenseRestoreArrayRead (in_mat, &in_mat_ptr);
>> MatDenseRestoreArrayWrite(out_mat, &out_mat_ptr);
>> }
>>
>> void test_scaling(const size_t matrix_size_rows, const size_t
>> matrix_size_cols, const bool print_matrices) {
>> PetscMPIInt rank, size;
>>
>> MPI_Comm_size(PETSC_COMM_WORLD, &size);
>> MPI_Comm_rank(PETSC_COMM_WORLD, &rank);
>>
>> arma::cx_mat in_mat = arma::zeros<arma::cx_mat>(matrix_size_rows,
>> matrix_size_cols),
>> out_mat = arma::zeros<arma::cx_mat>(matrix_size_rows,
>> matrix_size_cols);
>> arma::cx_rowvec matrix_vec =
>> arma::conv_to<arma::cx_rowvec>::from(arma::linspace<arma::rowvec>(0,
>> matrix_size_cols - 1, matrix_size_cols));
>> in_mat.each_row([&](arma::cx_rowvec &a){
>> a = matrix_vec;
>> });
>>
>> Mat petsc_in_mat, petsc_out_mat;
>> arma::cx_mat petsc_out_comparison_mat =
>> arma::zeros<arma::cx_mat>(matrix_size_rows, matrix_size_cols);
>> MatCreateDense(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE,
>> matrix_size_rows, matrix_size_cols, NULL, &petsc_in_mat);
>> MatCreateDense(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE,
>> matrix_size_rows, matrix_size_cols, NULL, &petsc_out_mat);
>>
>> create_data_in_PETSc_from_scratch (in_mat, petsc_in_mat);
>> MatZeroEntries(petsc_out_mat);
>>
>> const std::complex<double> scaling_factor{1. / matrix_size_cols, 0.};
>>
>> test_scaling_arma (in_mat, out_mat, scaling_factor);
>> test_scaling_petsc (petsc_in_mat, petsc_out_mat, scaling_factor);
>>
>> //Benchmark
>> auto t1 = std::chrono::high_resolution_clock::now();
>> for(size_t i = 0; i < bench_rounds; ++i) {
>> test_scaling_arma (in_mat, out_mat, scaling_factor);
>> }
>> auto t2 = std::chrono::high_resolution_clock::now();
>>
>> auto t3 = std::chrono::high_resolution_clock::now();
>> for(size_t i = 0; i < bench_rounds; ++i) {
>> test_scaling_petsc_pointer (petsc_in_mat, petsc_out_mat,
>> scaling_factor);
>> }
>> auto t4 = std::chrono::high_resolution_clock::now();
>>
>> auto t5 = std::chrono::high_resolution_clock::now();
>> for(size_t i = 0; i < bench_rounds; ++i) {
>> test_scaling_petsc (petsc_in_mat, petsc_out_mat,
>> scaling_factor);
>> }
>> auto t6 = std::chrono::high_resolution_clock::now();
>>
>>
>> retrieve_data_from_PETSc (petsc_out_mat, petsc_out_comparison_mat,
>> matrix_size_cols, matrix_size_rows);
>>
>> if(print_matrices && rank == 0) {
>> std::cout << "In-matrix, ARMA:\n" << in_mat
>> << "\n\nOut-matrix, ARMA:\n" << out_mat
>> << "\n\nComparison-out-matrix, ARMA:\n" <<
>> petsc_out_comparison_mat
>> << "\n\nDifference: \n" <<
>> arma::abs(petsc_out_comparison_mat - out_mat)
>> <<'\n';
>> }
>> if(rank == 0) {
>> std::cout << "Arma and PETSc/MatScale are equal:\t\t\t\t\t" <<
>> arma::approx_equal(out_mat, petsc_out_comparison_mat, "reldiff", 1e-8) <<
>> '\n';
>> std::cout << "Arma-time for a matrix size of ["
>> << matrix_size_rows << ", "
>> << matrix_size_cols << "]:\t\t\t\t"
>> << std::chrono::duration_cast<std::chrono::milliseconds>( t2 - t1 ).count()
>> << '\n';
>> std::cout << "PETSc-time, pointer for a matrix size of ["
>> << matrix_size_rows << ", "
>> << matrix_size_cols << "]:\t\t\t"
>> << std::chrono::duration_cast<std::chrono::milliseconds>( t4 - t3 ).count()
>> << '\n';
>> std::cout << "PETSc-time, MatScale for a matrix size of ["
>> << matrix_size_rows << ", "
>> << matrix_size_cols << "]:\t\t\t"
>> << std::chrono::duration_cast<std::chrono::milliseconds>( t6 - t5 ).count()
>> << '\n';
>> }
>> MatDestroy (&petsc_in_mat);
>> MatDestroy (&petsc_out_mat);
>> }
>> #include <helper_functions.hpp>
>>
>> void retrieve_data_from_PETSc(const Mat petsc_mat, arma::cx_mat &out_data,
>> const arma::uword
>> Ntime, const arma::uword Nradius) {
>> PetscMPIInt size;
>> MPI_Comm_size(PETSC_COMM_WORLD, &size);
>> if(out_data.n_rows != Ntime && out_data.n_cols != Nradius) {
>> out_data = arma::zeros<arma::cx_mat>(Ntime, Nradius);
>> }
>> Mat local_mat;
>> arma::Col<int> vector_indices_radius =
>> arma::linspace<arma::Col<int>>(0, Nradius - 1, Nradius);
>> arma::Col<int> vector_indices_time = arma::linspace<arma::Col<int>>(0,
>> Ntime - 1, Ntime);
>> //MatCreateRedundantMatrix(petsc_mat, Ntime * Nradius, MPI_COMM_NULL,
>> MAT_INITIAL_MATRIX, &local_mat);
>> MatCreateRedundantMatrix(petsc_mat, size, MPI_COMM_NULL,
>> MAT_INITIAL_MATRIX, &local_mat);
>> MatAssemblyBegin(local_mat, MAT_FINAL_ASSEMBLY);
>> MatAssemblyEnd(local_mat, MAT_FINAL_ASSEMBLY);
>> MatGetValues(local_mat, Nradius, vector_indices_radius.memptr(), Ntime,
>> vector_indices_time.memptr(), out_data.memptr());
>> MatDestroy(&local_mat);
>> out_data = out_data.st();
>> }
>>
>> void store_data_in_PETSc(const arma::cx_mat &in_data, Mat &petsc_mat) {
>> const arma::uword Ntime = in_data.n_cols;
>> const arma::uword Nradius = in_data.n_rows;
>> arma::Col<int> vector_indices_radius =
>> arma::linspace<arma::Col<int>>(0, Nradius - 1, Nradius);
>> arma::Col<int> vector_indices_time = arma::linspace<arma::Col<int>>(0,
>> Ntime - 1, Ntime);
>> MatZeroEntries(petsc_mat);
>> MatAssemblyBegin(petsc_mat, MAT_FINAL_ASSEMBLY);
>> MatAssemblyEnd(petsc_mat, MAT_FINAL_ASSEMBLY);
>> arma::cx_mat local_mat = in_data.st();
>> MatSetValues(petsc_mat, Nradius, vector_indices_radius.memptr(), Ntime,
>> vector_indices_time.memptr(), local_mat.memptr(), INSERT_VALUES);
>> MatAssemblyBegin(petsc_mat, MAT_FINAL_ASSEMBLY);
>> MatAssemblyEnd(petsc_mat, MAT_FINAL_ASSEMBLY);
>> }
>>
>> void create_data_in_PETSc_from_scratch(const arma::cx_mat &in_data, Mat
>> &petsc_mat) {
>> const arma::uword Ntime = in_data.n_cols;
>> const arma::uword Nradius = in_data.n_rows;
>> MatZeroEntries(petsc_mat);
>> MatAssemblyBegin(petsc_mat, MAT_FINAL_ASSEMBLY);
>> MatAssemblyEnd(petsc_mat, MAT_FINAL_ASSEMBLY);
>> for(int i = 0; i < (int)Ntime; ++i){
>> for(int j = 0; j < (int)Nradius; ++j) {
>> MatSetValue(petsc_mat, j, i, i, INSERT_VALUES);
>> }
>> }
>> MatAssemblyBegin(petsc_mat, MAT_FINAL_ASSEMBLY);
>> MatAssemblyEnd(petsc_mat, MAT_FINAL_ASSEMBLY);
>> }
>> #ifndef TEST_SCALING_HPP
>> #define TEST_SCALING_HPP
>>
>> #include <helper_functions.hpp>
>>
>> void test_scaling_arma(const arma::cx_mat & in_mat,
>> arma::cx_mat &out_mat,
>> const arma::cx_double
>> &scaling_factor);
>>
>> void test_scaling_petsc(const Mat &in_mat,
>> Mat &out_mat,
>> const PetscScalar
>> &scaling_factor);
>>
>> void test_scaling_petsc_pointer(const Mat &in_mat,
>> Mat &out_mat,
>> const
>> PetscScalar &scaling_factor);
>>
>> void test_scaling(const size_t matrix_size_rows, const size_t
>> matrix_size_cols, const bool print_matrices);
>>
>> #endif // TEST_SCALING_HPP
>> #ifndef HELPER_FUNCTIONS_HPP
>> #define HELPER_FUNCTIONS_HPP
>>
>> #include <iostream>
>>
>> #include <armadillo>
>> #include <petscts.h>
>> #include <petscmat.h>
>> #include <petscvec.h>
>> #include <fftw3.h>
>> #include <fftw3-mpi.h>
>> #include <metis.h>
>> #include <chrono>
>>
>> #include <boost/numeric/odeint.hpp>
>>
>> constexpr int bench_rounds = 1000;
>>
>> void retrieve_data_from_PETSc(const Mat petsc_mat, arma::cx_mat &out_data,
>> const arma::uword
>> Ntime, const arma::uword Nradius);
>>
>> void store_data_in_PETSc(const arma::cx_mat &in_data, Mat &petsc_mat);
>>
>> void create_data_in_PETSc_from_scratch(const arma::cx_mat &in_data, Mat
>> &petsc_mat);
>>
>> #endif // HELPER_FUNCTIONS_HPP