Hei,
when compiling the attached files using the following compilation line
//usr/lib64/mpi/gcc/openmpi3/bin/mpicxx -DBOOST_ALL_NO_LIB
-DBOOST_FILESYSTEM_DYN_LINK -DBOOST_MPI_DYN_LINK
-DBOOST_PROGRAM_OPTIONS_DYN_LINK -DBOOST_SERIALIZATION_DYN_LINK
-DUSE_CUDA
-I/home/roland/Dokumente/C++-Projekte/armadillo_with_PETSc/include
-I/opt/intel/compilers_and_libraries_2020.2.254/linux/mkl/include
-I/opt/armadillo/include -isystem /opt/petsc_release/include -isystem
/opt/fftw3/include -isystem /opt/boost/include -march=native
-fopenmp-simd -DMKL_LP64 -m64 -Wall -Wextra -pedantic -fPIC -flto -O2
-funroll-loops -funroll-all-loops -fstrict-aliasing -mavx -march=native
-fopenmp -std=gnu++17 -c <source_files> -o <target_files>/
and linking them with
//usr/lib64/mpi/gcc/openmpi3/bin/mpicxx -march=native -fopenmp-simd
-DMKL_LP64 -m64 <target_files> -o bin/armadillo_with_PETSc
-Wl,-rpath,/opt/boost/lib:/opt/fftw3/lib64:/opt/petsc_release/lib
/usr/lib64/libgsl.so /usr/lib64/libgslcblas.so -lgfortran
/opt/intel/mkl/lib/intel64/libmkl_rt.so
/opt/boost/lib/libboost_filesystem.so.1.72.0
/opt/boost/lib/libboost_mpi.so.1.72.0
/opt/boost/lib/libboost_program_options.so.1.72.0
/opt/boost/lib/libboost_serialization.so.1.72.0
/opt/fftw3/lib64/libfftw3.so /opt/fftw3/lib64/libfftw3_mpi.so
/opt/petsc_release/lib/libpetsc.so
/usr/lib64/gcc/x86_64-suse-linux/9/libgomp.so/
my output is
/Arma and PETSc/MatScale are equal:
0//
//Arma-time for a matrix size of [1024,
8192]: 24955//
//PETSc-time, pointer for a matrix size of [1024,
8192]: 28283//
//PETSc-time, MatScale for a matrix size of [1024,
8192]: 23138/
but when removing the explicit call to openmp (i.e. removing /-fopenmp/
and //usr/lib64/gcc/x86_64-suse-linux/9/libgomp.so/) my result is
/Arma and PETSc/MatScale are equal:
1//
//Arma-time for a matrix size of [1024,
8192]: 24878//
//PETSc-time, pointer for a matrix size of [1024,
8192]: 18942//
//PETSc-time, MatScale for a matrix size of [1024,
8192]: 23350/
even though both times the executable is linked to
/ libmkl_intel_lp64.so =>
/opt/intel/compilers_and_libraries_2020.2.254/linux/mkl/lib/intel64_lin/libmkl_intel_lp64.so
(0x00007f9eebd70000)//
// libmkl_core.so =>
/opt/intel/compilers_and_libraries_2020.2.254/linux/mkl/lib/intel64_lin/libmkl_core.so
(0x00007f9ee77aa000)//
// libmkl_intel_thread.so =>
/opt/intel/compilers_and_libraries_2020.2.254/linux/mkl/lib/intel64_lin/libmkl_intel_thread.so
(0x00007f9ee42c3000)//
// libiomp5.so =>
/opt/intel/compilers_and_libraries_2020.2.254/linux/compiler/lib/intel64_lin/libiomp5.so
(0x00007f9ee3ebd000)//
// libgomp.so.1 => /usr/lib64/libgomp.so.1 (0x00007f9ea98bd000)/
via the petsc-library. Why does the execution time vary by so much, and
why does my result change when calling MatScale (i.e. returning wrong
results) when explicitly linking to OpenMP?
Thanks!
Regards,
Roland
#include <test_scaling.hpp>
int main(int argc, char **args) {
PetscMPIInt rank, size;
PetscInitialize(&argc, &args, (char*) 0, NULL);
MPI_Comm_size(PETSC_COMM_WORLD, &size);
MPI_Comm_rank(PETSC_COMM_WORLD, &rank);
test_scaling (1024, 8192, false);
PetscFinalize();
return 0;
}
#include <test_scaling.hpp>
void test_scaling_arma(const arma::cx_mat & in_mat,
arma::cx_mat &out_mat,
const arma::cx_double &scaling_factor) {
out_mat = in_mat;
out_mat *= scaling_factor;
}
void test_scaling_petsc(const Mat &in_mat,
Mat &out_mat,
const PetscScalar &scaling_factor) {
MatZeroEntries(out_mat);
MatAXPY(out_mat, scaling_factor, in_mat, SAME_NONZERO_PATTERN);
}
void test_scaling_petsc_pointer(const Mat &in_mat,
Mat &out_mat,
const PetscScalar &scaling_factor) {
const PetscScalar *in_mat_ptr;
PetscScalar *out_mat_ptr;
MatDenseGetArrayRead (in_mat, &in_mat_ptr);
MatDenseGetArrayWrite (out_mat, &out_mat_ptr);
PetscInt r_0, r_1;
MatGetLocalSize (out_mat, &r_0, &r_1);
#pragma omp parallel for
for(int i = 0; i < r_0 * r_1; ++i)
*(out_mat_ptr + i) = (*(in_mat_ptr + i) * scaling_factor);
MatDenseRestoreArrayRead (in_mat, &in_mat_ptr);
MatDenseRestoreArrayWrite(out_mat, &out_mat_ptr);
}
void test_scaling(const size_t matrix_size_rows, const size_t matrix_size_cols, const bool print_matrices) {
PetscMPIInt rank, size;
MPI_Comm_size(PETSC_COMM_WORLD, &size);
MPI_Comm_rank(PETSC_COMM_WORLD, &rank);
arma::cx_mat in_mat = arma::zeros<arma::cx_mat>(matrix_size_rows, matrix_size_cols),
out_mat = arma::zeros<arma::cx_mat>(matrix_size_rows, matrix_size_cols);
arma::cx_rowvec matrix_vec = arma::conv_to<arma::cx_rowvec>::from(arma::linspace<arma::rowvec>(0, matrix_size_cols - 1, matrix_size_cols));
in_mat.each_row([&](arma::cx_rowvec &a){
a = matrix_vec;
});
Mat petsc_in_mat, petsc_out_mat;
arma::cx_mat petsc_out_comparison_mat = arma::zeros<arma::cx_mat>(matrix_size_rows, matrix_size_cols);
MatCreateDense(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, matrix_size_rows, matrix_size_cols, NULL, &petsc_in_mat);
MatCreateDense(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, matrix_size_rows, matrix_size_cols, NULL, &petsc_out_mat);
create_data_in_PETSc_from_scratch (in_mat, petsc_in_mat);
MatZeroEntries(petsc_out_mat);
const std::complex<double> scaling_factor{1. / matrix_size_cols, 0.};
test_scaling_arma (in_mat, out_mat, scaling_factor);
test_scaling_petsc (petsc_in_mat, petsc_out_mat, scaling_factor);
//Benchmark
auto t1 = std::chrono::high_resolution_clock::now();
for(size_t i = 0; i < bench_rounds; ++i) {
test_scaling_arma (in_mat, out_mat, scaling_factor);
}
auto t2 = std::chrono::high_resolution_clock::now();
auto t3 = std::chrono::high_resolution_clock::now();
for(size_t i = 0; i < bench_rounds; ++i) {
test_scaling_petsc_pointer (petsc_in_mat, petsc_out_mat, scaling_factor);
}
auto t4 = std::chrono::high_resolution_clock::now();
auto t5 = std::chrono::high_resolution_clock::now();
for(size_t i = 0; i < bench_rounds; ++i) {
test_scaling_petsc (petsc_in_mat, petsc_out_mat, scaling_factor);
}
auto t6 = std::chrono::high_resolution_clock::now();
retrieve_data_from_PETSc (petsc_out_mat, petsc_out_comparison_mat, matrix_size_cols, matrix_size_rows);
if(print_matrices && rank == 0) {
std::cout << "In-matrix, ARMA:\n" << in_mat
<< "\n\nOut-matrix, ARMA:\n" << out_mat
<< "\n\nComparison-out-matrix, ARMA:\n" << petsc_out_comparison_mat
<< "\n\nDifference: \n" << arma::abs(petsc_out_comparison_mat - out_mat)
<<'\n';
}
if(rank == 0) {
std::cout << "Arma and PETSc/MatScale are equal:\t\t\t\t\t" << arma::approx_equal(out_mat, petsc_out_comparison_mat, "reldiff", 1e-8) << '\n';
std::cout << "Arma-time for a matrix size of ["
<< matrix_size_rows << ", "
<< matrix_size_cols << "]:\t\t\t\t"
<< std::chrono::duration_cast<std::chrono::milliseconds>( t2 - t1 ).count() << '\n';
std::cout << "PETSc-time, pointer for a matrix size of ["
<< matrix_size_rows << ", "
<< matrix_size_cols << "]:\t\t\t"
<< std::chrono::duration_cast<std::chrono::milliseconds>( t4 - t3 ).count() << '\n';
std::cout << "PETSc-time, MatScale for a matrix size of ["
<< matrix_size_rows << ", "
<< matrix_size_cols << "]:\t\t\t"
<< std::chrono::duration_cast<std::chrono::milliseconds>( t6 - t5 ).count() << '\n';
}
MatDestroy (&petsc_in_mat);
MatDestroy (&petsc_out_mat);
}
#include <helper_functions.hpp>
void retrieve_data_from_PETSc(const Mat petsc_mat, arma::cx_mat &out_data,
const arma::uword Ntime, const arma::uword Nradius) {
PetscMPIInt size;
MPI_Comm_size(PETSC_COMM_WORLD, &size);
if(out_data.n_rows != Ntime && out_data.n_cols != Nradius) {
out_data = arma::zeros<arma::cx_mat>(Ntime, Nradius);
}
Mat local_mat;
arma::Col<int> vector_indices_radius = arma::linspace<arma::Col<int>>(0, Nradius - 1, Nradius);
arma::Col<int> vector_indices_time = arma::linspace<arma::Col<int>>(0, Ntime - 1, Ntime);
//MatCreateRedundantMatrix(petsc_mat, Ntime * Nradius, MPI_COMM_NULL, MAT_INITIAL_MATRIX, &local_mat);
MatCreateRedundantMatrix(petsc_mat, size, MPI_COMM_NULL, MAT_INITIAL_MATRIX, &local_mat);
MatAssemblyBegin(local_mat, MAT_FINAL_ASSEMBLY);
MatAssemblyEnd(local_mat, MAT_FINAL_ASSEMBLY);
MatGetValues(local_mat, Nradius, vector_indices_radius.memptr(), Ntime, vector_indices_time.memptr(), out_data.memptr());
MatDestroy(&local_mat);
out_data = out_data.st();
}
void store_data_in_PETSc(const arma::cx_mat &in_data, Mat &petsc_mat) {
const arma::uword Ntime = in_data.n_cols;
const arma::uword Nradius = in_data.n_rows;
arma::Col<int> vector_indices_radius = arma::linspace<arma::Col<int>>(0, Nradius - 1, Nradius);
arma::Col<int> vector_indices_time = arma::linspace<arma::Col<int>>(0, Ntime - 1, Ntime);
MatZeroEntries(petsc_mat);
MatAssemblyBegin(petsc_mat, MAT_FINAL_ASSEMBLY);
MatAssemblyEnd(petsc_mat, MAT_FINAL_ASSEMBLY);
arma::cx_mat local_mat = in_data.st();
MatSetValues(petsc_mat, Nradius, vector_indices_radius.memptr(), Ntime, vector_indices_time.memptr(), local_mat.memptr(), INSERT_VALUES);
MatAssemblyBegin(petsc_mat, MAT_FINAL_ASSEMBLY);
MatAssemblyEnd(petsc_mat, MAT_FINAL_ASSEMBLY);
}
void create_data_in_PETSc_from_scratch(const arma::cx_mat &in_data, Mat &petsc_mat) {
const arma::uword Ntime = in_data.n_cols;
const arma::uword Nradius = in_data.n_rows;
MatZeroEntries(petsc_mat);
MatAssemblyBegin(petsc_mat, MAT_FINAL_ASSEMBLY);
MatAssemblyEnd(petsc_mat, MAT_FINAL_ASSEMBLY);
for(int i = 0; i < (int)Ntime; ++i){
for(int j = 0; j < (int)Nradius; ++j) {
MatSetValue(petsc_mat, j, i, i, INSERT_VALUES);
}
}
MatAssemblyBegin(petsc_mat, MAT_FINAL_ASSEMBLY);
MatAssemblyEnd(petsc_mat, MAT_FINAL_ASSEMBLY);
}
#ifndef TEST_SCALING_HPP
#define TEST_SCALING_HPP
#include <helper_functions.hpp>
void test_scaling_arma(const arma::cx_mat & in_mat,
arma::cx_mat &out_mat,
const arma::cx_double &scaling_factor);
void test_scaling_petsc(const Mat &in_mat,
Mat &out_mat,
const PetscScalar &scaling_factor);
void test_scaling_petsc_pointer(const Mat &in_mat,
Mat &out_mat,
const PetscScalar &scaling_factor);
void test_scaling(const size_t matrix_size_rows, const size_t matrix_size_cols, const bool print_matrices);
#endif // TEST_SCALING_HPP
#ifndef HELPER_FUNCTIONS_HPP
#define HELPER_FUNCTIONS_HPP
#include <iostream>
#include <armadillo>
#include <petscts.h>
#include <petscmat.h>
#include <petscvec.h>
#include <fftw3.h>
#include <fftw3-mpi.h>
#include <metis.h>
#include <chrono>
#include <boost/numeric/odeint.hpp>
constexpr int bench_rounds = 1000;
void retrieve_data_from_PETSc(const Mat petsc_mat, arma::cx_mat &out_data,
const arma::uword Ntime, const arma::uword Nradius);
void store_data_in_PETSc(const arma::cx_mat &in_data, Mat &petsc_mat);
void create_data_in_PETSc_from_scratch(const arma::cx_mat &in_data, Mat &petsc_mat);
#endif // HELPER_FUNCTIONS_HPP