Hei,

I replaced the linking line with

//usr/lib64/mpi/gcc/openmpi3/bin/mpicxx  -march=native -fopenmp-simd
-DMKL_LP64 -m64
CMakeFiles/armadillo_with_PETSc.dir/Unity/unity_0_cxx.cxx.o -o
bin/armadillo_with_PETSc 
-Wl,-rpath,/opt/boost/lib:/opt/fftw3/lib64:/opt/petsc_release/lib
/usr/lib64/libgsl.so /usr/lib64/libgslcblas.so -lgfortran 
-L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_intel_lp64
-lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl
/opt/boost/lib/libboost_filesystem.so.1.72.0
/opt/boost/lib/libboost_mpi.so.1.72.0
/opt/boost/lib/libboost_program_options.so.1.72.0
/opt/boost/lib/libboost_serialization.so.1.72.0
/opt/fftw3/lib64/libfftw3.so /opt/fftw3/lib64/libfftw3_mpi.so
/opt/petsc_release/lib/libpetsc.so
/usr/lib64/gcc/x86_64-suse-linux/9/libgomp.so
/

and now the results are correct. Nevertheless, when comparing the loop
in line 26-28 in file test_scaling.cpp

/#pragma omp parallel for//
//    for(int i = 0; i < r_0 * r_1; ++i)//
//        *(out_mat_ptr + i) = (*(in_mat_ptr + i) * scaling_factor);/

the version without /#pragma omp parallel/ for is significantly faster
(i.e. 18 s vs 28 s) compared to the version with /omp./ Why is there
still such a big difference?

Thanks!

Am 17.02.21 um 17:10 schrieb Jed Brown:
> You're using an MKL linked to Intel's OpenMP. I could imagine there being 
> symbol conflicts causing MKL to compute wrong results if libgomp symbols are 
> picked up.
>
> Note that -fopenmp-simd does not require linking -- it just gives the 
> compiler hints about how to vectorize. So you can probably keep using it and 
> just stop passing libgomp.so. Alternatively, you can link MKL to work with 
> libgomp (see the MKL link advisor).
>
> Roland Richter <[email protected]> writes:
>
>> Hei,
>>
>> when compiling the attached files using the following compilation line
>>
>> //usr/lib64/mpi/gcc/openmpi3/bin/mpicxx -DBOOST_ALL_NO_LIB
>> -DBOOST_FILESYSTEM_DYN_LINK -DBOOST_MPI_DYN_LINK
>> -DBOOST_PROGRAM_OPTIONS_DYN_LINK -DBOOST_SERIALIZATION_DYN_LINK
>> -DUSE_CUDA
>> -I/home/roland/Dokumente/C++-Projekte/armadillo_with_PETSc/include
>> -I/opt/intel/compilers_and_libraries_2020.2.254/linux/mkl/include
>> -I/opt/armadillo/include -isystem /opt/petsc_release/include -isystem
>> /opt/fftw3/include -isystem /opt/boost/include -march=native
>> -fopenmp-simd -DMKL_LP64 -m64 -Wall -Wextra -pedantic -fPIC -flto -O2
>> -funroll-loops -funroll-all-loops -fstrict-aliasing -mavx -march=native
>> -fopenmp -std=gnu++17 -c <source_files> -o <target_files>/
>>
>> and linking them with 
>>
>> //usr/lib64/mpi/gcc/openmpi3/bin/mpicxx  -march=native -fopenmp-simd
>> -DMKL_LP64 -m64 <target_files> -o bin/armadillo_with_PETSc 
>> -Wl,-rpath,/opt/boost/lib:/opt/fftw3/lib64:/opt/petsc_release/lib
>> /usr/lib64/libgsl.so /usr/lib64/libgslcblas.so -lgfortran
>> /opt/intel/mkl/lib/intel64/libmkl_rt.so
>> /opt/boost/lib/libboost_filesystem.so.1.72.0
>> /opt/boost/lib/libboost_mpi.so.1.72.0
>> /opt/boost/lib/libboost_program_options.so.1.72.0
>> /opt/boost/lib/libboost_serialization.so.1.72.0
>> /opt/fftw3/lib64/libfftw3.so /opt/fftw3/lib64/libfftw3_mpi.so
>> /opt/petsc_release/lib/libpetsc.so
>> /usr/lib64/gcc/x86_64-suse-linux/9/libgomp.so/
>>
>> my output is 
>>
>> /Arma and PETSc/MatScale are equal:                                     
>>         0//
>> //Arma-time for a matrix size of [1024,
>> 8192]:                                      24955//
>> //PETSc-time, pointer for a matrix size of [1024,
>> 8192]:                      28283//
>> //PETSc-time, MatScale for a matrix size of [1024,
>> 8192]:                  23138/
>>
>> but when removing the explicit call to openmp (i.e. removing /-fopenmp/
>> and //usr/lib64/gcc/x86_64-suse-linux/9/libgomp.so/) my result is
>>
>> /Arma and PETSc/MatScale are equal:                                     
>>        1//
>> //Arma-time for a matrix size of [1024,
>> 8192]:                                     24878//
>> //PETSc-time, pointer for a matrix size of [1024,
>> 8192]:                     18942//
>> //PETSc-time, MatScale for a matrix size of [1024,
>> 8192]:                 23350/
>>
>> even though both times the executable is linked to
>>
>> /        libmkl_intel_lp64.so =>
>> /opt/intel/compilers_and_libraries_2020.2.254/linux/mkl/lib/intel64_lin/libmkl_intel_lp64.so
>> (0x00007f9eebd70000)//
>> //        libmkl_core.so =>
>> /opt/intel/compilers_and_libraries_2020.2.254/linux/mkl/lib/intel64_lin/libmkl_core.so
>> (0x00007f9ee77aa000)//
>> //        libmkl_intel_thread.so =>
>> /opt/intel/compilers_and_libraries_2020.2.254/linux/mkl/lib/intel64_lin/libmkl_intel_thread.so
>> (0x00007f9ee42c3000)//
>> //        libiomp5.so =>
>> /opt/intel/compilers_and_libraries_2020.2.254/linux/compiler/lib/intel64_lin/libiomp5.so
>> (0x00007f9ee3ebd000)//
>> //        libgomp.so.1 => /usr/lib64/libgomp.so.1 (0x00007f9ea98bd000)/
>>
>> via the petsc-library. Why does the execution time vary by so much, and
>> why does my result change when calling MatScale (i.e. returning wrong
>> results) when explicitly linking to OpenMP?
>>
>> Thanks!
>>
>> Regards,
>>
>> Roland
>>
>> #include <test_scaling.hpp>
>>
>> int main(int argc, char **args) {
>>      PetscMPIInt rank, size;
>>      PetscInitialize(&argc, &args, (char*) 0, NULL);
>>
>>      MPI_Comm_size(PETSC_COMM_WORLD, &size);
>>      MPI_Comm_rank(PETSC_COMM_WORLD, &rank);
>>
>>      test_scaling (1024, 8192, false);
>>      PetscFinalize();
>>      return 0;
>> }
>> #include <test_scaling.hpp>
>>
>> void test_scaling_arma(const arma::cx_mat & in_mat,
>>                                         arma::cx_mat &out_mat,
>>                                         const arma::cx_double 
>> &scaling_factor) {
>>      out_mat = in_mat;
>>      out_mat *= scaling_factor;
>> }
>>
>> void test_scaling_petsc(const Mat &in_mat,
>>                                              Mat &out_mat,
>>                                              const PetscScalar 
>> &scaling_factor) {
>>      MatZeroEntries(out_mat);
>>      MatAXPY(out_mat, scaling_factor, in_mat, SAME_NONZERO_PATTERN);
>> }
>>
>> void test_scaling_petsc_pointer(const Mat &in_mat,
>>                                                              Mat &out_mat,
>>                                                              const 
>> PetscScalar &scaling_factor) {
>>      const PetscScalar *in_mat_ptr;
>>      PetscScalar *out_mat_ptr;
>>      MatDenseGetArrayRead (in_mat, &in_mat_ptr);
>>      MatDenseGetArrayWrite (out_mat, &out_mat_ptr);
>>      PetscInt r_0, r_1;
>>      MatGetLocalSize (out_mat, &r_0, &r_1);
>> #pragma omp parallel for
>>      for(int i = 0; i < r_0 * r_1; ++i)
>>              *(out_mat_ptr + i) = (*(in_mat_ptr + i) * scaling_factor);
>>
>>      MatDenseRestoreArrayRead (in_mat, &in_mat_ptr);
>>      MatDenseRestoreArrayWrite(out_mat, &out_mat_ptr);
>> }
>>
>> void test_scaling(const size_t matrix_size_rows, const size_t 
>> matrix_size_cols, const bool print_matrices) {
>>      PetscMPIInt rank, size;
>>
>>      MPI_Comm_size(PETSC_COMM_WORLD, &size);
>>      MPI_Comm_rank(PETSC_COMM_WORLD, &rank);
>>
>>      arma::cx_mat in_mat = arma::zeros<arma::cx_mat>(matrix_size_rows, 
>> matrix_size_cols),
>>                      out_mat = arma::zeros<arma::cx_mat>(matrix_size_rows, 
>> matrix_size_cols);
>>      arma::cx_rowvec matrix_vec = 
>> arma::conv_to<arma::cx_rowvec>::from(arma::linspace<arma::rowvec>(0, 
>> matrix_size_cols - 1, matrix_size_cols));
>>      in_mat.each_row([&](arma::cx_rowvec &a){
>>              a = matrix_vec;
>>      });
>>
>>      Mat petsc_in_mat, petsc_out_mat;
>>      arma::cx_mat petsc_out_comparison_mat = 
>> arma::zeros<arma::cx_mat>(matrix_size_rows, matrix_size_cols);
>>      MatCreateDense(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, 
>> matrix_size_rows, matrix_size_cols, NULL, &petsc_in_mat);
>>      MatCreateDense(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, 
>> matrix_size_rows, matrix_size_cols, NULL, &petsc_out_mat);
>>
>>      create_data_in_PETSc_from_scratch (in_mat, petsc_in_mat);
>>      MatZeroEntries(petsc_out_mat);
>>
>>      const std::complex<double> scaling_factor{1. / matrix_size_cols, 0.};
>>
>>      test_scaling_arma (in_mat, out_mat, scaling_factor);
>>      test_scaling_petsc (petsc_in_mat, petsc_out_mat, scaling_factor);
>>
>>      //Benchmark
>>      auto t1 = std::chrono::high_resolution_clock::now();
>>      for(size_t i = 0; i < bench_rounds; ++i) {
>>              test_scaling_arma (in_mat, out_mat, scaling_factor);
>>      }
>>      auto t2 = std::chrono::high_resolution_clock::now();
>>
>>      auto t3 = std::chrono::high_resolution_clock::now();
>>      for(size_t i = 0; i < bench_rounds; ++i) {
>>              test_scaling_petsc_pointer (petsc_in_mat, petsc_out_mat, 
>> scaling_factor);
>>      }
>>      auto t4 = std::chrono::high_resolution_clock::now();
>>
>>      auto t5 = std::chrono::high_resolution_clock::now();
>>      for(size_t i = 0; i < bench_rounds; ++i) {
>>              test_scaling_petsc (petsc_in_mat, petsc_out_mat, 
>> scaling_factor);
>>      }
>>      auto t6 = std::chrono::high_resolution_clock::now();
>>
>>
>>      retrieve_data_from_PETSc (petsc_out_mat, petsc_out_comparison_mat, 
>> matrix_size_cols, matrix_size_rows);
>>
>>      if(print_matrices && rank == 0) {
>>              std::cout << "In-matrix, ARMA:\n" << in_mat
>>                                << "\n\nOut-matrix, ARMA:\n" << out_mat
>>                                << "\n\nComparison-out-matrix, ARMA:\n" << 
>> petsc_out_comparison_mat
>>                                << "\n\nDifference: \n" << 
>> arma::abs(petsc_out_comparison_mat - out_mat)
>>                                <<'\n';
>>      }
>>      if(rank == 0) {
>>              std::cout << "Arma and PETSc/MatScale are equal:\t\t\t\t\t" << 
>> arma::approx_equal(out_mat, petsc_out_comparison_mat, "reldiff", 1e-8) << 
>> '\n';
>>              std::cout << "Arma-time for a matrix size of ["
>> << matrix_size_rows << ", "
>> << matrix_size_cols << "]:\t\t\t\t"
>> << std::chrono::duration_cast<std::chrono::milliseconds>( t2 - t1 ).count() 
>> << '\n';
>>              std::cout << "PETSc-time, pointer for a matrix size of ["
>> << matrix_size_rows << ", "
>> << matrix_size_cols << "]:\t\t\t"
>> << std::chrono::duration_cast<std::chrono::milliseconds>( t4 - t3 ).count() 
>> << '\n';
>>              std::cout << "PETSc-time, MatScale for a matrix size of ["
>> << matrix_size_rows << ", "
>> << matrix_size_cols << "]:\t\t\t"
>> << std::chrono::duration_cast<std::chrono::milliseconds>( t6 - t5 ).count() 
>> << '\n';
>>      }
>>      MatDestroy (&petsc_in_mat);
>>      MatDestroy (&petsc_out_mat);
>> }
>> #include <helper_functions.hpp>
>>
>> void retrieve_data_from_PETSc(const Mat petsc_mat, arma::cx_mat &out_data,
>>                                                        const arma::uword 
>> Ntime, const arma::uword Nradius) {
>>      PetscMPIInt size;
>>      MPI_Comm_size(PETSC_COMM_WORLD, &size);
>>      if(out_data.n_rows != Ntime && out_data.n_cols != Nradius) {
>>              out_data = arma::zeros<arma::cx_mat>(Ntime, Nradius);
>>      }
>>      Mat local_mat;
>>      arma::Col<int> vector_indices_radius = 
>> arma::linspace<arma::Col<int>>(0, Nradius - 1, Nradius);
>>      arma::Col<int> vector_indices_time = arma::linspace<arma::Col<int>>(0, 
>> Ntime - 1, Ntime);
>>      //MatCreateRedundantMatrix(petsc_mat, Ntime * Nradius, MPI_COMM_NULL, 
>> MAT_INITIAL_MATRIX, &local_mat);
>>      MatCreateRedundantMatrix(petsc_mat, size, MPI_COMM_NULL, 
>> MAT_INITIAL_MATRIX, &local_mat);
>>      MatAssemblyBegin(local_mat, MAT_FINAL_ASSEMBLY);
>>      MatAssemblyEnd(local_mat, MAT_FINAL_ASSEMBLY);
>>      MatGetValues(local_mat, Nradius, vector_indices_radius.memptr(), Ntime, 
>> vector_indices_time.memptr(), out_data.memptr());
>>      MatDestroy(&local_mat);
>>      out_data = out_data.st();
>> }
>>
>> void store_data_in_PETSc(const arma::cx_mat &in_data, Mat &petsc_mat) {
>>      const arma::uword Ntime = in_data.n_cols;
>>      const arma::uword Nradius = in_data.n_rows;
>>      arma::Col<int> vector_indices_radius = 
>> arma::linspace<arma::Col<int>>(0, Nradius - 1, Nradius);
>>      arma::Col<int> vector_indices_time = arma::linspace<arma::Col<int>>(0, 
>> Ntime - 1, Ntime);
>>      MatZeroEntries(petsc_mat);
>>      MatAssemblyBegin(petsc_mat, MAT_FINAL_ASSEMBLY);
>>      MatAssemblyEnd(petsc_mat, MAT_FINAL_ASSEMBLY);
>>      arma::cx_mat local_mat = in_data.st();
>>      MatSetValues(petsc_mat, Nradius, vector_indices_radius.memptr(), Ntime, 
>> vector_indices_time.memptr(), local_mat.memptr(), INSERT_VALUES);
>>      MatAssemblyBegin(petsc_mat, MAT_FINAL_ASSEMBLY);
>>      MatAssemblyEnd(petsc_mat, MAT_FINAL_ASSEMBLY);
>> }
>>
>> void create_data_in_PETSc_from_scratch(const arma::cx_mat &in_data, Mat 
>> &petsc_mat) {
>>      const arma::uword Ntime = in_data.n_cols;
>>      const arma::uword Nradius = in_data.n_rows;
>>      MatZeroEntries(petsc_mat);
>>      MatAssemblyBegin(petsc_mat, MAT_FINAL_ASSEMBLY);
>>      MatAssemblyEnd(petsc_mat, MAT_FINAL_ASSEMBLY);
>>      for(int i = 0; i < (int)Ntime; ++i){
>>              for(int j = 0; j < (int)Nradius; ++j) {
>>                      MatSetValue(petsc_mat, j, i, i, INSERT_VALUES);
>>              }
>>      }
>>      MatAssemblyBegin(petsc_mat, MAT_FINAL_ASSEMBLY);
>>      MatAssemblyEnd(petsc_mat, MAT_FINAL_ASSEMBLY);
>> }
>> #ifndef TEST_SCALING_HPP
>> #define TEST_SCALING_HPP
>>
>> #include <helper_functions.hpp>
>>
>> void test_scaling_arma(const arma::cx_mat & in_mat,
>>                                         arma::cx_mat &out_mat,
>>                                         const arma::cx_double 
>> &scaling_factor);
>>
>> void test_scaling_petsc(const Mat &in_mat,
>>                                              Mat &out_mat,
>>                                              const PetscScalar 
>> &scaling_factor);
>>
>> void test_scaling_petsc_pointer(const Mat &in_mat,
>>                                                              Mat &out_mat,
>>                                                              const 
>> PetscScalar &scaling_factor);
>>
>> void test_scaling(const size_t matrix_size_rows, const size_t 
>> matrix_size_cols, const bool print_matrices);
>>
>> #endif // TEST_SCALING_HPP
>> #ifndef HELPER_FUNCTIONS_HPP
>> #define HELPER_FUNCTIONS_HPP
>>
>> #include <iostream>
>>
>> #include <armadillo>
>> #include <petscts.h>
>> #include <petscmat.h>
>> #include <petscvec.h>
>> #include <fftw3.h>
>> #include <fftw3-mpi.h>
>> #include <metis.h>
>> #include <chrono>
>>
>> #include <boost/numeric/odeint.hpp>
>>
>> constexpr int bench_rounds = 1000;
>>
>> void retrieve_data_from_PETSc(const Mat petsc_mat, arma::cx_mat &out_data,
>>                                                        const arma::uword 
>> Ntime, const arma::uword Nradius);
>>
>> void store_data_in_PETSc(const arma::cx_mat &in_data, Mat &petsc_mat);
>>
>> void create_data_in_PETSc_from_scratch(const arma::cx_mat &in_data, Mat 
>> &petsc_mat);
>>
>> #endif // HELPER_FUNCTIONS_HPP

Reply via email to