Hei,

when compiling the attached files using the following compilation line

//usr/lib64/mpi/gcc/openmpi3/bin/mpicxx -DBOOST_ALL_NO_LIB
-DBOOST_FILESYSTEM_DYN_LINK -DBOOST_MPI_DYN_LINK
-DBOOST_PROGRAM_OPTIONS_DYN_LINK -DBOOST_SERIALIZATION_DYN_LINK
-DUSE_CUDA
-I/home/roland/Dokumente/C++-Projekte/armadillo_with_PETSc/include
-I/opt/intel/compilers_and_libraries_2020.2.254/linux/mkl/include
-I/opt/armadillo/include -isystem /opt/petsc_release/include -isystem
/opt/fftw3/include -isystem /opt/boost/include -march=native
-fopenmp-simd -DMKL_LP64 -m64 -Wall -Wextra -pedantic -fPIC -flto -O2
-funroll-loops -funroll-all-loops -fstrict-aliasing -mavx -march=native
-fopenmp -std=gnu++17 -c <source_files> -o <target_files>/

and linking them with 

//usr/lib64/mpi/gcc/openmpi3/bin/mpicxx  -march=native -fopenmp-simd
-DMKL_LP64 -m64 <target_files> -o bin/armadillo_with_PETSc 
-Wl,-rpath,/opt/boost/lib:/opt/fftw3/lib64:/opt/petsc_release/lib
/usr/lib64/libgsl.so /usr/lib64/libgslcblas.so -lgfortran
/opt/intel/mkl/lib/intel64/libmkl_rt.so
/opt/boost/lib/libboost_filesystem.so.1.72.0
/opt/boost/lib/libboost_mpi.so.1.72.0
/opt/boost/lib/libboost_program_options.so.1.72.0
/opt/boost/lib/libboost_serialization.so.1.72.0
/opt/fftw3/lib64/libfftw3.so /opt/fftw3/lib64/libfftw3_mpi.so
/opt/petsc_release/lib/libpetsc.so
/usr/lib64/gcc/x86_64-suse-linux/9/libgomp.so/

my output is 

/Arma and PETSc/MatScale are equal:                                     
        0//
//Arma-time for a matrix size of [1024,
8192]:                                      24955//
//PETSc-time, pointer for a matrix size of [1024,
8192]:                      28283//
//PETSc-time, MatScale for a matrix size of [1024,
8192]:                  23138/

but when removing the explicit call to openmp (i.e. removing /-fopenmp/
and //usr/lib64/gcc/x86_64-suse-linux/9/libgomp.so/) my result is

/Arma and PETSc/MatScale are equal:                                     
       1//
//Arma-time for a matrix size of [1024,
8192]:                                     24878//
//PETSc-time, pointer for a matrix size of [1024,
8192]:                     18942//
//PETSc-time, MatScale for a matrix size of [1024,
8192]:                 23350/

even though both times the executable is linked to

/        libmkl_intel_lp64.so =>
/opt/intel/compilers_and_libraries_2020.2.254/linux/mkl/lib/intel64_lin/libmkl_intel_lp64.so
(0x00007f9eebd70000)//
//        libmkl_core.so =>
/opt/intel/compilers_and_libraries_2020.2.254/linux/mkl/lib/intel64_lin/libmkl_core.so
(0x00007f9ee77aa000)//
//        libmkl_intel_thread.so =>
/opt/intel/compilers_and_libraries_2020.2.254/linux/mkl/lib/intel64_lin/libmkl_intel_thread.so
(0x00007f9ee42c3000)//
//        libiomp5.so =>
/opt/intel/compilers_and_libraries_2020.2.254/linux/compiler/lib/intel64_lin/libiomp5.so
(0x00007f9ee3ebd000)//
//        libgomp.so.1 => /usr/lib64/libgomp.so.1 (0x00007f9ea98bd000)/

via the petsc-library. Why does the execution time vary by so much, and
why does my result change when calling MatScale (i.e. returning wrong
results) when explicitly linking to OpenMP?

Thanks!

Regards,

Roland

#include <test_scaling.hpp>

int main(int argc, char **args) {
	PetscMPIInt rank, size;
	PetscInitialize(&argc, &args, (char*) 0, NULL);

	MPI_Comm_size(PETSC_COMM_WORLD, &size);
	MPI_Comm_rank(PETSC_COMM_WORLD, &rank);

	test_scaling (1024, 8192, false);
	PetscFinalize();
	return 0;
}
#include <test_scaling.hpp>

void test_scaling_arma(const arma::cx_mat & in_mat,
					   arma::cx_mat &out_mat,
					   const arma::cx_double &scaling_factor) {
	out_mat = in_mat;
	out_mat *= scaling_factor;
}

void test_scaling_petsc(const Mat &in_mat,
						Mat &out_mat,
						const PetscScalar &scaling_factor) {
	MatZeroEntries(out_mat);
	MatAXPY(out_mat, scaling_factor, in_mat, SAME_NONZERO_PATTERN);
}

void test_scaling_petsc_pointer(const Mat &in_mat,
								Mat &out_mat,
								const PetscScalar &scaling_factor) {
	const PetscScalar *in_mat_ptr;
	PetscScalar *out_mat_ptr;
	MatDenseGetArrayRead (in_mat, &in_mat_ptr);
	MatDenseGetArrayWrite (out_mat, &out_mat_ptr);
	PetscInt r_0, r_1;
	MatGetLocalSize (out_mat, &r_0, &r_1);
#pragma omp parallel for
	for(int i = 0; i < r_0 * r_1; ++i)
		*(out_mat_ptr + i) = (*(in_mat_ptr + i) * scaling_factor);

	MatDenseRestoreArrayRead (in_mat, &in_mat_ptr);
	MatDenseRestoreArrayWrite(out_mat, &out_mat_ptr);
}

void test_scaling(const size_t matrix_size_rows, const size_t matrix_size_cols, const bool print_matrices) {
	PetscMPIInt rank, size;

	MPI_Comm_size(PETSC_COMM_WORLD, &size);
	MPI_Comm_rank(PETSC_COMM_WORLD, &rank);

	arma::cx_mat in_mat = arma::zeros<arma::cx_mat>(matrix_size_rows, matrix_size_cols),
			out_mat = arma::zeros<arma::cx_mat>(matrix_size_rows, matrix_size_cols);
	arma::cx_rowvec matrix_vec = arma::conv_to<arma::cx_rowvec>::from(arma::linspace<arma::rowvec>(0, matrix_size_cols - 1, matrix_size_cols));
	in_mat.each_row([&](arma::cx_rowvec &a){
		a = matrix_vec;
	});

	Mat petsc_in_mat, petsc_out_mat;
	arma::cx_mat petsc_out_comparison_mat = arma::zeros<arma::cx_mat>(matrix_size_rows, matrix_size_cols);
	MatCreateDense(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, matrix_size_rows, matrix_size_cols, NULL, &petsc_in_mat);
	MatCreateDense(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, matrix_size_rows, matrix_size_cols, NULL, &petsc_out_mat);

	create_data_in_PETSc_from_scratch (in_mat, petsc_in_mat);
	MatZeroEntries(petsc_out_mat);

	const std::complex<double> scaling_factor{1. / matrix_size_cols, 0.};

	test_scaling_arma (in_mat, out_mat, scaling_factor);
	test_scaling_petsc (petsc_in_mat, petsc_out_mat, scaling_factor);

	//Benchmark
	auto t1 = std::chrono::high_resolution_clock::now();
	for(size_t i = 0; i < bench_rounds; ++i) {
		test_scaling_arma (in_mat, out_mat, scaling_factor);
	}
	auto t2 = std::chrono::high_resolution_clock::now();

	auto t3 = std::chrono::high_resolution_clock::now();
	for(size_t i = 0; i < bench_rounds; ++i) {
		test_scaling_petsc_pointer (petsc_in_mat, petsc_out_mat, scaling_factor);
	}
	auto t4 = std::chrono::high_resolution_clock::now();

	auto t5 = std::chrono::high_resolution_clock::now();
	for(size_t i = 0; i < bench_rounds; ++i) {
		test_scaling_petsc (petsc_in_mat, petsc_out_mat, scaling_factor);
	}
	auto t6 = std::chrono::high_resolution_clock::now();


	retrieve_data_from_PETSc (petsc_out_mat, petsc_out_comparison_mat, matrix_size_cols, matrix_size_rows);

	if(print_matrices && rank == 0) {
		std::cout << "In-matrix, ARMA:\n" << in_mat
				  << "\n\nOut-matrix, ARMA:\n" << out_mat
				  << "\n\nComparison-out-matrix, ARMA:\n" << petsc_out_comparison_mat
				  << "\n\nDifference: \n" << arma::abs(petsc_out_comparison_mat - out_mat)
				  <<'\n';
	}
	if(rank == 0) {
		std::cout << "Arma and PETSc/MatScale are equal:\t\t\t\t\t" << arma::approx_equal(out_mat, petsc_out_comparison_mat, "reldiff", 1e-8) << '\n';
		std::cout << "Arma-time for a matrix size of ["
<< matrix_size_rows << ", "
<< matrix_size_cols << "]:\t\t\t\t"
<< std::chrono::duration_cast<std::chrono::milliseconds>( t2 - t1 ).count() << '\n';
		std::cout << "PETSc-time, pointer for a matrix size of ["
<< matrix_size_rows << ", "
<< matrix_size_cols << "]:\t\t\t"
<< std::chrono::duration_cast<std::chrono::milliseconds>( t4 - t3 ).count() << '\n';
		std::cout << "PETSc-time, MatScale for a matrix size of ["
<< matrix_size_rows << ", "
<< matrix_size_cols << "]:\t\t\t"
<< std::chrono::duration_cast<std::chrono::milliseconds>( t6 - t5 ).count() << '\n';
	}
	MatDestroy (&petsc_in_mat);
	MatDestroy (&petsc_out_mat);
}
#include <helper_functions.hpp>

void retrieve_data_from_PETSc(const Mat petsc_mat, arma::cx_mat &out_data,
							  const arma::uword Ntime, const arma::uword Nradius) {
	PetscMPIInt size;
	MPI_Comm_size(PETSC_COMM_WORLD, &size);
	if(out_data.n_rows != Ntime && out_data.n_cols != Nradius) {
		out_data = arma::zeros<arma::cx_mat>(Ntime, Nradius);
	}
	Mat local_mat;
	arma::Col<int> vector_indices_radius = arma::linspace<arma::Col<int>>(0, Nradius - 1, Nradius);
	arma::Col<int> vector_indices_time = arma::linspace<arma::Col<int>>(0, Ntime - 1, Ntime);
	//MatCreateRedundantMatrix(petsc_mat, Ntime * Nradius, MPI_COMM_NULL, MAT_INITIAL_MATRIX, &local_mat);
	MatCreateRedundantMatrix(petsc_mat, size, MPI_COMM_NULL, MAT_INITIAL_MATRIX, &local_mat);
	MatAssemblyBegin(local_mat, MAT_FINAL_ASSEMBLY);
	MatAssemblyEnd(local_mat, MAT_FINAL_ASSEMBLY);
	MatGetValues(local_mat, Nradius, vector_indices_radius.memptr(), Ntime, vector_indices_time.memptr(), out_data.memptr());
	MatDestroy(&local_mat);
	out_data = out_data.st();
}

void store_data_in_PETSc(const arma::cx_mat &in_data, Mat &petsc_mat) {
	const arma::uword Ntime = in_data.n_cols;
	const arma::uword Nradius = in_data.n_rows;
	arma::Col<int> vector_indices_radius = arma::linspace<arma::Col<int>>(0, Nradius - 1, Nradius);
	arma::Col<int> vector_indices_time = arma::linspace<arma::Col<int>>(0, Ntime - 1, Ntime);
	MatZeroEntries(petsc_mat);
	MatAssemblyBegin(petsc_mat, MAT_FINAL_ASSEMBLY);
	MatAssemblyEnd(petsc_mat, MAT_FINAL_ASSEMBLY);
	arma::cx_mat local_mat = in_data.st();
	MatSetValues(petsc_mat, Nradius, vector_indices_radius.memptr(), Ntime, vector_indices_time.memptr(), local_mat.memptr(), INSERT_VALUES);
	MatAssemblyBegin(petsc_mat, MAT_FINAL_ASSEMBLY);
	MatAssemblyEnd(petsc_mat, MAT_FINAL_ASSEMBLY);
}

void create_data_in_PETSc_from_scratch(const arma::cx_mat &in_data, Mat &petsc_mat) {
	const arma::uword Ntime = in_data.n_cols;
	const arma::uword Nradius = in_data.n_rows;
	MatZeroEntries(petsc_mat);
	MatAssemblyBegin(petsc_mat, MAT_FINAL_ASSEMBLY);
	MatAssemblyEnd(petsc_mat, MAT_FINAL_ASSEMBLY);
	for(int i = 0; i < (int)Ntime; ++i){
		for(int j = 0; j < (int)Nradius; ++j) {
			MatSetValue(petsc_mat, j, i, i, INSERT_VALUES);
		}
	}
	MatAssemblyBegin(petsc_mat, MAT_FINAL_ASSEMBLY);
	MatAssemblyEnd(petsc_mat, MAT_FINAL_ASSEMBLY);
}
#ifndef TEST_SCALING_HPP
#define TEST_SCALING_HPP

#include <helper_functions.hpp>

void test_scaling_arma(const arma::cx_mat & in_mat,
					   arma::cx_mat &out_mat,
					   const arma::cx_double &scaling_factor);

void test_scaling_petsc(const Mat &in_mat,
						Mat &out_mat,
						const PetscScalar &scaling_factor);

void test_scaling_petsc_pointer(const Mat &in_mat,
								Mat &out_mat,
								const PetscScalar &scaling_factor);

void test_scaling(const size_t matrix_size_rows, const size_t matrix_size_cols, const bool print_matrices);

#endif // TEST_SCALING_HPP
#ifndef HELPER_FUNCTIONS_HPP
#define HELPER_FUNCTIONS_HPP

#include <iostream>

#include <armadillo>
#include <petscts.h>
#include <petscmat.h>
#include <petscvec.h>
#include <fftw3.h>
#include <fftw3-mpi.h>
#include <metis.h>
#include <chrono>

#include <boost/numeric/odeint.hpp>

constexpr int bench_rounds = 1000;

void retrieve_data_from_PETSc(const Mat petsc_mat, arma::cx_mat &out_data,
							  const arma::uword Ntime, const arma::uword Nradius);

void store_data_in_PETSc(const arma::cx_mat &in_data, Mat &petsc_mat);

void create_data_in_PETSc_from_scratch(const arma::cx_mat &in_data, Mat &petsc_mat);

#endif // HELPER_FUNCTIONS_HPP

Reply via email to