Thanks a lot Pierre! I managed to solve my problem for now using the (less scalable) solution that you provided. I also opened up an issue about the missing MatMPIAIJGetLocalMat() petsc4py binding here https://gitlab.com/petsc/petsc/-/issues/1443 and if no action is taken, I might take care of it myself as well, soon.
For the sake of completeness, and to help any potential PETSc user that might run into the same issue, here is my final code that works nicely in parallel. """Experimenting with PETSc mat-mat multiplication""" import numpy as np from firedrake import COMM_WORLD from firedrake.petsc import PETSc from numpy.testing import assert_array_almost_equal # import pdb nproc = COMM_WORLD.size rank = COMM_WORLD.rank def Print(x: str): """Prints the string only on the root process Args: x (str): String to be printed """ PETSc.Sys.Print(x) def print_mat_info(mat, name): """Prints the matrix information Args: mat (PETSc mat): PETSc matrix name (string): Name of the matrix """ Print(f"MATRIX {name} [{mat.getSize()[0]}x{mat.getSize()[1]}]") # print(f"For rank {rank} local {name}: {mat.getSizes()}") Print(mat.getType()) mat.view() Print("") COMM_WORLD.Barrier() Print("") def create_petsc_matrix_seq(input_array): """Building a sequential PETSc matrix from an array Args: input_array (np array): Input array Returns: seq mat: PETSc matrix """ assert len(input_array.shape) == 2 m, n = input_array.shape matrix = PETSc.Mat().createAIJ(size=(m, n), comm=PETSc.COMM_SELF) matrix.setUp() matrix.setValues(range(m), range(n), input_array, addv=False) # Assembly the matrix to compute the final structure matrix.assemblyBegin() matrix.assemblyEnd() return matrix def create_petsc_matrix(input_array, sparse=True): """Create a PETSc matrix from an input_array Args: input_array (np array): Input array partition_like (PETSc mat, optional): Petsc matrix. Defaults to None. sparse (bool, optional): Toggle for sparese or dense. Defaults to True. Returns: PETSc mat: PETSc matrix """ # Check if input_array is 1D and reshape if necessary assert len(input_array.shape) == 2, "Input array should be 2-dimensional" global_rows, global_cols = input_array.shape size = ((None, global_rows), (global_cols, global_cols)) # Create a sparse or dense matrix based on the 'sparse' argument if sparse: matrix = PETSc.Mat().createAIJ(size=size, comm=COMM_WORLD) else: matrix = PETSc.Mat().createDense(size=size, comm=COMM_WORLD) matrix.setUp() local_rows_start, local_rows_end = matrix.getOwnershipRange() for counter, i in enumerate(range(local_rows_start, local_rows_end)): # Calculate the correct row in the array for the current process row_in_array = counter + local_rows_start matrix.setValues( i, range(global_cols), input_array[row_in_array, :], addv=False ) # Assembly the matrix to compute the final structure matrix.assemblyBegin() matrix.assemblyEnd() return matrix def get_local_submatrix(A): """Get the local submatrix of A Args: A (mpi PETSc mat): partitioned PETSc matrix Returns: seq mat: PETSc matrix """ local_rows_start, local_rows_end = A.getOwnershipRange() local_rows = local_rows_end - local_rows_start comm = A.getComm() rows = PETSc.IS().createStride( local_rows, first=local_rows_start, step=1, comm=comm ) _, k = A.getSize() # Get the number of columns (k) from A's size cols = PETSc.IS().createStride(k, first=0, step=1, comm=comm) # Getting the local submatrix # TODO: To be replaced by MatMPIAIJGetLocalMat() in the future (see petsc-users mailing list). There is a missing petsc4py binding, need to add it myself (and please create a merge request) A_local = A.createSubMatrices(rows, cols)[0] return A_local def multiply_sequential_matrices(A_seq, B_seq): """Multiply 2 sequential matrices Args: A_seq (seqaij): local submatrix of A B_seq (seqaij): sequential matrix B Returns: seq mat: PETSc matrix that is the product of A_seq and B_seq """ _, A_seq_cols = A_seq.getSize() B_seq_rows, _ = B_seq.getSize() assert ( A_seq_cols == B_seq_rows ), f"Incompatible matrix sizes for multiplication: {A_seq_cols} != {B_seq_rows}" C_local = A_seq.matMult(B_seq) return C_local def create_global_matrix(C_local, A): """Create the global matrix C from the local submatrix C_local Args: C_local (seqaij): local submatrix of C A (mpi PETSc mat): PETSc matrix A Returns: mpi PETSc mat: partitioned PETSc matrix C """ C_local_rows, C_local_cols = C_local.getSize() local_rows_start, _ = A.getOwnershipRange() m, _ = A.getSize() C = PETSc.Mat().createAIJ( size=((None, m), (C_local_cols, C_local_cols)), comm=COMM_WORLD ) C.setUp() for i in range(C_local_rows): cols, values = C_local.getRow(i) global_row = i + local_rows_start C.setValues(global_row, cols, values) C.assemblyBegin() C.assemblyEnd() return C m, k = 11, 7 # Generate the random numpy matrices np.random.seed(0) # sets the seed to 0 A_np = np.random.randint(low=0, high=6, size=(m, k)) B_np = np.random.randint(low=0, high=6, size=(k, k)) # Create B as a sequential matrix on each process B_seq = create_petsc_matrix_seq(B_np) print_mat_info(B_seq, "B") A = create_petsc_matrix(A_np) print_mat_info(A, "A") # Getting the correct local submatrix to be multiplied by B_seq A_local = get_local_submatrix(A) # Multiplication of 2 sequential matrices C_local = multiply_sequential_matrices(A_local, B_seq) # Creating the global C matrix C = create_global_matrix(C_local, A) print_mat_info(C, "C") # -------------------------------------------- # TEST: Multiplication of 2 numpy matrices # -------------------------------------------- AB_np = np.dot(A_np, B_np) Print(f"MATRIX AB_np [{AB_np.shape[0]}x{AB_np.shape[1]}]") Print(AB_np) # Get the local values from C local_rows_start, local_rows_end = C.getOwnershipRange() C_local = C.getValues(range(local_rows_start, local_rows_end), range(k)) # Assert the correctness of the multiplication for the local subset assert_array_almost_equal(C_local, AB_np[local_rows_start:local_rows_end, :], decimal=5) > On 23 Aug 2023, at 14:40, Pierre Jolivet <pierre.joli...@lip6.fr> wrote: > > > >> On 23 Aug 2023, at 8:59 PM, Thanasis Boutsikakis >> <thanasis.boutsika...@corintis.com> wrote: >> >> Thanks for the clarification Mark. >> >> I have tried such an implementation but I since I could not find the >> equivalent of https://petsc.org/main/manualpages/Mat/MatMPIAIJGetLocalMat/ >> for petsc4py, I used A.getLocalSubMatrix to do so, which returns a >> ‘localref’ object that I cannot then use to get my local 'seqaij’ matrix. > > You really need MatMPIAIJGetLocalMat(). > It seems there is a missing petsc4py binding, either add it yourself (and > please create a merge request), or post an issue on GitLab and hope that > someone adds it. > Another way to bypass the issue would be to call MatCreateSubMatrices() with > an is_row set to the local rows, and an is_col set to all columns, but this > will be less scalable than MatMPIAIJGetLocalMat(). > > Thanks, > Pierre > >> I also tried A.getDenseLocalMatrix() but it seems not to be suitable for my >> case. I couldn’t find an example in the petsc4py source code or something >> relevant, do you have any ideas? >> >> """Experimenting with petsc mat-mat multiplication""" >> # import pdb >> >> import numpy as np >> from firedrake import COMM_WORLD >> from firedrake.petsc import PETSc >> from numpy.testing import assert_array_almost_equal >> import pdb >> >> nproc = COMM_WORLD.size >> rank = COMM_WORLD.rank >> >> >> def Print(x: str): >> """Prints the string only on the root process >> >> Args: >> x (str): String to be printed >> """ >> PETSc.Sys.Print(x) >> >> >> def create_petsc_matrix_seq(input_array): >> """Building a sequential petsc matrix from an array >> >> Args: >> input_array (np array): Input array >> >> Returns: >> seq mat: PETSc matrix >> """ >> assert len(input_array.shape) == 2 >> >> m, n = input_array.shape >> matrix = PETSc.Mat().createAIJ(size=(m, n), comm=PETSc.COMM_SELF) >> matrix.setUp() >> >> matrix.setValues(range(m), range(n), input_array, addv=False) >> >> # Assembly the matrix to compute the final structure >> matrix.assemblyBegin() >> matrix.assemblyEnd() >> >> return matrix >> >> >> def create_petsc_matrix(input_array, partition_like=None): >> """Create a PETSc matrix from an input_array >> >> Args: >> input_array (np array): Input array >> partition_like (petsc mat, optional): Petsc matrix. Defaults to None. >> sparse (bool, optional): Toggle for sparese or dense. Defaults to >> True. >> >> Returns: >> petsc mat: PETSc matrix >> """ >> # Check if input_array is 1D and reshape if necessary >> assert len(input_array.shape) == 2, "Input array should be 2-dimensional" >> global_rows, global_cols = input_array.shape >> >> comm = COMM_WORLD >> if partition_like is not None: >> local_rows_start, local_rows_end = partition_like.getOwnershipRange() >> local_rows = local_rows_end - local_rows_start >> >> # No parallelization in the columns, set local_cols = None to >> parallelize >> size = ((local_rows, global_rows), (global_cols, global_cols)) >> else: >> size = ((None, global_rows), (global_cols, global_cols)) >> >> matrix = PETSc.Mat().createAIJ(size=size, comm=comm) >> matrix.setUp() >> >> local_rows_start, local_rows_end = matrix.getOwnershipRange() >> >> for counter, i in enumerate(range(local_rows_start, local_rows_end)): >> # Calculate the correct row in the array for the current process >> row_in_array = counter + local_rows_start >> matrix.setValues( >> i, range(global_cols), input_array[row_in_array, :], addv=False >> ) >> >> # Assembly the matrix to compute the final structure >> matrix.assemblyBegin() >> matrix.assemblyEnd() >> >> return matrix >> >> >> m, k = 10, 3 >> # Generate the random numpy matrices >> np.random.seed(0) # sets the seed to 0 >> A_np = np.random.randint(low=0, high=6, size=(m, k)) >> B_np = np.random.randint(low=0, high=6, size=(k, k)) >> >> # Create B as a sequential matrix on each process >> B_seq = create_petsc_matrix_seq(B_np) >> >> Print(B_seq.getType()) >> Print(B_seq.getSizes()) >> >> A = create_petsc_matrix(A_np) >> >> print("A type:", A.getType()) >> print("A sizes:", A.getSizes()) >> print("A local ownership range:", A.getOwnershipRange()) >> >> # pdb.set_trace() >> >> # Create a local sequential matrix for A using the local submatrix >> local_rows_start, local_rows_end = A.getOwnershipRange() >> local_rows = local_rows_end - local_rows_start >> >> print("local_rows_start:", local_rows_start) >> print("local_rows_end:", local_rows_end) >> print("local_rows:", local_rows) >> >> local_A = PETSc.Mat().createAIJ(size=(local_rows, k), comm=PETSc.COMM_SELF) >> >> # pdb.set_trace() >> >> comm = A.getComm() >> rows = PETSc.IS().createStride(local_rows, first=0, step=1, comm=comm) >> cols = PETSc.IS().createStride(k, first=0, step=1, comm=comm) >> >> print("rows indices:", rows.getIndices()) >> print("cols indices:", cols.getIndices()) >> >> # pdb.set_trace() >> >> # Create the local to global mapping for rows and columns >> l2g_rows = PETSc.LGMap().create(rows.getIndices(), comm=comm) >> l2g_cols = PETSc.LGMap().create(cols.getIndices(), comm=comm) >> >> print("l2g_rows type:", type(l2g_rows)) >> print("l2g_rows:", l2g_rows.view()) >> print("l2g_rows type:", type(l2g_cols)) >> print("l2g_cols:", l2g_cols.view()) >> >> # pdb.set_trace() >> >> # Set the local-to-global mapping for the matrix >> A.setLGMap(l2g_rows, l2g_cols) >> >> # pdb.set_trace() >> >> # Now you can get the local submatrix >> local_A = A.getLocalSubMatrix(rows, cols) >> >> # Assembly the matrix to compute the final structure >> local_A.assemblyBegin() >> local_A.assemblyEnd() >> >> Print(local_A.getType()) >> Print(local_A.getSizes()) >> >> # pdb.set_trace() >> >> # Multiply the two matrices >> local_C = local_A.matMult(B_seq) >> >> >>> On 23 Aug 2023, at 12:56, Mark Adams <mfad...@lbl.gov> wrote: >>> >>> >>> >>> On Wed, Aug 23, 2023 at 5:36 AM Thanasis Boutsikakis >>> <thanasis.boutsika...@corintis.com >>> <mailto:thanasis.boutsika...@corintis.com>> wrote: >>>> Thanks for the suggestion Pierre. >>>> >>>> Yes B is duplicated by all processes. >>>> >>>> In this case, should B be created as a sequential sparse matrix using >>>> COMM_SELF? >>> >>> Yes, that is what Pierre said, >>> >>> Mark >>> >>>> I guess if not, the multiplication of B with the output of >>>> https://petsc.org/main/manualpages/Mat/MatMPIAIJGetLocalMat/ would not go >>>> through, right? >>>> >>>> Thanks, >>>> Thanos >>>> >>>>> On 23 Aug 2023, at 10:47, Pierre Jolivet <pierre.joli...@lip6.fr >>>>> <mailto:pierre.joli...@lip6.fr>> wrote: >>>>> >>>>> >>>>> >>>>>> On 23 Aug 2023, at 5:35 PM, Thanasis Boutsikakis >>>>>> <thanasis.boutsika...@corintis.com >>>>>> <mailto:thanasis.boutsika...@corintis.com>> wrote: >>>>>> >>>>>> Hi all, >>>>>> >>>>>> I am trying to multiply two Petsc matrices as C = A * B, where A is a >>>>>> tall matrix and B is a relatively small matrix. >>>>>> >>>>>> I have taken the decision to create A as (row-)partitioned matrix and B >>>>>> as a non-partitioned matrix that it is entirely shared by all procs (to >>>>>> avoid unnecessary communication). >>>>>> >>>>>> Here is my code: >>>>>> >>>>>> import numpy as np >>>>>> from firedrake import COMM_WORLD >>>>>> from firedrake.petsc import PETSc >>>>>> from numpy.testing import assert_array_almost_equal >>>>>> >>>>>> nproc = COMM_WORLD.size >>>>>> rank = COMM_WORLD.rank >>>>>> >>>>>> def create_petsc_matrix_non_partitioned(input_array): >>>>>> """Building a mpi non-partitioned petsc matrix from an array >>>>>> >>>>>> Args: >>>>>> input_array (np array): Input array >>>>>> sparse (bool, optional): Toggle for sparese or dense. Defaults >>>>>> to True. >>>>>> >>>>>> Returns: >>>>>> mpi mat: PETSc matrix >>>>>> """ >>>>>> assert len(input_array.shape) == 2 >>>>>> >>>>>> m, n = input_array.shape >>>>>> >>>>>> matrix = PETSc.Mat().createAIJ(size=((m, n), (m, n)), >>>>>> comm=COMM_WORLD) >>>>>> >>>>>> # Set the values of the matrix >>>>>> matrix.setValues(range(m), range(n), input_array[:, :], addv=False) >>>>>> >>>>>> # Assembly the matrix to compute the final structure >>>>>> matrix.assemblyBegin() >>>>>> matrix.assemblyEnd() >>>>>> >>>>>> return matrix >>>>>> >>>>>> >>>>>> def create_petsc_matrix(input_array, partition_like=None): >>>>>> """Create a PETSc matrix from an input_array >>>>>> >>>>>> Args: >>>>>> input_array (np array): Input array >>>>>> partition_like (petsc mat, optional): Petsc matrix. Defaults to >>>>>> None. >>>>>> sparse (bool, optional): Toggle for sparese or dense. Defaults >>>>>> to True. >>>>>> >>>>>> Returns: >>>>>> petsc mat: PETSc matrix >>>>>> """ >>>>>> # Check if input_array is 1D and reshape if necessary >>>>>> assert len(input_array.shape) == 2, "Input array should be >>>>>> 2-dimensional" >>>>>> global_rows, global_cols = input_array.shape >>>>>> >>>>>> comm = COMM_WORLD >>>>>> if partition_like is not None: >>>>>> local_rows_start, local_rows_end = >>>>>> partition_like.getOwnershipRange() >>>>>> local_rows = local_rows_end - local_rows_start >>>>>> >>>>>> # No parallelization in the columns, set local_cols = None to >>>>>> parallelize >>>>>> size = ((local_rows, global_rows), (global_cols, global_cols)) >>>>>> else: >>>>>> size = ((None, global_rows), (global_cols, global_cols)) >>>>>> >>>>>> matrix = PETSc.Mat().createAIJ(size=size, comm=comm) >>>>>> matrix.setUp() >>>>>> >>>>>> local_rows_start, local_rows_end = matrix.getOwnershipRange() >>>>>> >>>>>> for counter, i in enumerate(range(local_rows_start, local_rows_end)): >>>>>> # Calculate the correct row in the array for the current process >>>>>> row_in_array = counter + local_rows_start >>>>>> matrix.setValues( >>>>>> i, range(global_cols), input_array[row_in_array, :], >>>>>> addv=False >>>>>> ) >>>>>> >>>>>> # Assembly the matrix to compute the final structure >>>>>> matrix.assemblyBegin() >>>>>> matrix.assemblyEnd() >>>>>> >>>>>> return matrix >>>>>> >>>>>> >>>>>> m, k = 10, 3 >>>>>> # Generate the random numpy matrices >>>>>> np.random.seed(0) # sets the seed to 0 >>>>>> A_np = np.random.randint(low=0, high=6, size=(m, k)) >>>>>> B_np = np.random.randint(low=0, high=6, size=(k, k)) >>>>>> >>>>>> >>>>>> A = create_petsc_matrix(A_np) >>>>>> >>>>>> B = create_petsc_matrix_non_partitioned(B_np) >>>>>> >>>>>> # Now perform the multiplication >>>>>> C = A * B >>>>>> >>>>>> The problem with this is that there is a mismatch between the local rows >>>>>> of A (depend on the partitioning) and the global rows of B (3 for all >>>>>> procs), so that the multiplication cannot happen in parallel. Here is >>>>>> the error: >>>>>> >>>>>> [0]PETSC ERROR: >>>>>> ------------------------------------------------------------------------ >>>>>> [0]PETSC ERROR: Caught signal number 11 SEGV: Segmentation Violation, >>>>>> probably memory access out of range >>>>>> [0]PETSC ERROR: Try option -start_in_debugger or >>>>>> -on_error_attach_debugger >>>>>> [0]PETSC ERROR: or see https://petsc.org/release/faq/#valgrind and >>>>>> https://petsc.org/release/faq/ >>>>>> [0]PETSC ERROR: configure using --with-debugging=yes, recompile, link, >>>>>> and run >>>>>> [0]PETSC ERROR: to get more information on the crash. >>>>>> [0]PETSC ERROR: Run with -malloc_debug to check if memory corruption is >>>>>> causing the crash. >>>>>> application called MPI_Abort(MPI_COMM_WORLD, 59) - process 0 >>>>>> [unset]: write_line error; fd=-1 buf=:cmd=abort exitcode=59 >>>>>> : >>>>>> system msg for write_line failure : Bad file descriptor >>>>>> >>>>>> >>>>>> Is there a standard way to achieve this? >>>>> >>>>> Your B is duplicated by all processes? >>>>> If so, then, call >>>>> https://petsc.org/main/manualpages/Mat/MatMPIAIJGetLocalMat/, do a >>>>> sequential product with B on COMM_SELF, not COMM_WORLD, and use >>>>> https://petsc.org/main/manualpages/Mat/MatCreateMPIMatConcatenateSeqMat/ >>>>> with the output. >>>>> >>>>> Thanks, >>>>> Pierre >>>>> >>>>>> Thanks, >>>>>> Thanos >>>>>> >>>>>> >>>> >> >