And here are the ex55 diffs: diff --git a/src/ksp/ksp/tutorials/ex55.c b/src/ksp/ksp/tutorials/ex55.c index dded9cae13b..18bdcc8efaf 100644 --- a/src/ksp/ksp/tutorials/ex55.c +++ b/src/ksp/ksp/tutorials/ex55.c @@ -6,6 +6,7 @@ Load of 1.0 in x direction on all nodes (not a true uniform load).\n\ -alpha <v> : scaling of material coefficient in embedded circle\n\n";
#include <petscksp.h> +#include "../../../../src/ksp/pc/impls/gamg/gamg.h" /*I "petscpc.h" I*/ int main(int argc, char **args) { @@ -221,6 +222,30 @@ int main(int argc, char **args) PetscCall(VecSet(xx, .0)); + PC pc; + PetscCall(KSPGetPC(ksp, &pc)); + PC_MG *mg = (PC_MG *)pc->data; + PC_MG_Levels **mglevels = mg->levels; + Mat P = mglevels[mg->nlevels-1]->interpolate; + PetscCall(MatViewFromOptions(mglevels[mg->nlevels-1]->A, NULL, "-rap_mat_view")); + PetscCall(MatViewFromOptions(Amat, NULL, "-rap_mat_view")); + KSP ksp2; + PetscCall(KSPCreate(PETSC_COMM_WORLD, &ksp2)); + PetscCall(KSPSetOptionsPrefix(ksp2, "rap_")); + PetscCall(KSPSetFromOptions(ksp2)); + PetscCall(KSPGetPC(ksp2, &pc)); + PetscCall(KSPSetOperators(ksp2, Amat, Amat)); + PetscCall(PCMGSetGalerkin(pc, PC_MG_GALERKIN_PMAT)); + PetscCall(PCMGSetInterpolation(pc, 1, P)); + PetscCall(VecSet(bb, 1.0)); + PetscCall(PetscLogStagePush(stage[1])); + PetscCall(KSPSolve(ksp2, bb, xx)); + //PetscCall(MatViewFromOptions(mglevels[0]->A, NULL, "-rap_mat_view")); + PetscCall(PetscLogStagePop()); + PetscCall(PetscFinalize()); + exit(12); + + PetscCall(PetscLogStagePush(stage[1])); PetscCall(KSPSolve(ksp, bb, xx)); PetscCall(PetscLogStagePop()); On Tue, Feb 18, 2025 at 11:21 AM Mark Adams <mfad...@lbl.gov> wrote: > And, I forgot that the GAMG coarse grid (and this tiny grid) are forced to > one processor, hence valgrind errors only on one process. > Add: -pc_gamg_parallel_coarse_grid_solver -ne 13 > If you want to see valgrind errors on all 4 processors. > > On Tue, Feb 18, 2025 at 11:07 AM Mark Adams <mfad...@lbl.gov> wrote: > >> Also, this uses the branch: adams/mat-rap-blocksize >> that has fixes to get the block sizes moved up in P'AP. >> >> On Tue, Feb 18, 2025 at 9:29 AM Mark Adams <mfad...@lbl.gov> wrote: >> >>> I've got a bug in pbjacobi that only shows up on *the Galerkin coarse >>> grid* (I have not been able to reproduce it on a fine grid at least), and >>> in *parallel*, and on *GPUs*/kokkos. 
>>> >>> I have modified ex55 to take P from GAMG and give it (one) to PCMG with >>> Galerkin coarse grids, and solve (code and command lines appended). >>> I see this with ex56 (bs=3 & 6), ex55 (bs=2), but ex54 (bs=1) is fine >>> (does pbjacobi switch to jacobi?) >>> >>> With 4 processors I get these valgrind errors only on these bad solves >>> (no false positives). Note that *only one process has errors*, and note >>> some solver output before and after: >>> >>> I'm going to keep digging but ideas are welcome, >>> Thanks, >>> Mark >>> >>> [0] <pc:gamg> PCSetUp_GAMG(): (null): 1) N=4, n data cols=2, nnz/row >>> (ave)=4, 1 active pes >>> [0] <pc:gamg> PCSetUp_GAMG(): (null): 2 levels, operator complexity = >>> 1.04 >>> [0] <pc:gamg> PCSetUp_GAMG(): (null): PCSetUp_GAMG: call >>> KSPChebyshevSetEigenvalues on level 0 (N=32) with emax = 2.26125 emin = >>> 0.0198344 >>> [0] <pc:gamg> PCSetUp_MG(): Using outer operators to define finest grid >>> operator >>> because PCMGGetSmoother(pc,nlevels-1,&ksp);KSPSetOperators(ksp,...); >>> was not called. >>> [0] <pc:mg> PCSetUp_MG(): Using outer operators to define finest grid >>> operator >>> because PCMGGetSmoother(pc,nlevels-1,&ksp);KSPSetOperators(ksp,...); >>> was not called. >>> ==978424== Invalid read of size 16 >>> ==978424== at 0x42B9AFA2: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42AF0F12: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42C3EE7B: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42EEC474: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42B4B495: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42EC748F: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42B44781: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42CF766D: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42445504: ??? 
(in >>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53) >>> ==978424== by 0x42417A04: ??? (in >>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53) >>> ==978424== by 0x42468730: cudaMemcpy (in >>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53) >>> ==978424== by 0x4896979: cuda_memcpy_wrapper<> >>> (Kokkos_Cuda_Instance.hpp:365) >>> ==978424== by 0x4896979: Kokkos::Impl::DeepCopyCuda(void*, void >>> const*, unsigned long) (Kokkos_CudaSpace.cpp:62) >>> ==978424== Address 0xcb722dfc is 1,644 bytes inside a block of size >>> 1,652 alloc'd >>> ==978424== at 0x4E0A926: memalign (in >>> /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so) >>> ==978424== by 0x4E0AA69: posix_memalign (in >>> /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so) >>> ==978424== by 0x57E3BD9: PetscMallocAlign (mal.c:52) >>> ==978424== by 0x57E892F: PetscTrMallocDefault (mtr.c:175) >>> ==978424== by 0x57E5A77: PetscMallocA (mal.c:421) >>> ==978424== by 0x63B02F5: MatInvertBlockDiagonal_SeqAIJ (aij.c:3333) >>> ==978424== by 0x69B0599: MatInvertBlockDiagonal (matrix.c:10908) >>> ==978424== by 0x629D585: MatInvertBlockDiagonal_MPIAIJ (mpiaij.c:2588) >>> ==978424== by 0x69B0599: MatInvertBlockDiagonal (matrix.c:10908) >>> ==978424== by 0x7C7F726: PCSetUp_PBJacobi_Host (pbjacobi.c:256) >>> ==978424== by 0x7D737D8: PCSetUp_PBJacobi_Kokkos >>> (pbjacobi_kok.kokkos.cxx:90) >>> ==978424== by 0x7C803F8: PCSetUp_PBJacobi (pbjacobi.c:296) >>> ==978424== >>> ==978424== Invalid read of size 16 >>> ==978424== at 0x42B9AFB4: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42AF0F12: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42C3EE7B: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42EEC474: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42B4B495: ??? 
(in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42EC748F: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42B44781: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42CF766D: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42445504: ??? (in >>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53) >>> ==978424== by 0x42417A04: ??? (in >>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53) >>> ==978424== by 0x42468730: cudaMemcpy (in >>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53) >>> ==978424== by 0x4896979: cuda_memcpy_wrapper<> >>> (Kokkos_Cuda_Instance.hpp:365) >>> ==978424== by 0x4896979: Kokkos::Impl::DeepCopyCuda(void*, void >>> const*, unsigned long) (Kokkos_CudaSpace.cpp:62) >>> ==978424== Address 0xcb722e0c is 8 bytes after a block of size 1,652 >>> alloc'd >>> ==978424== at 0x4E0A926: memalign (in >>> /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so) >>> ==978424== by 0x4E0AA69: posix_memalign (in >>> /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so) >>> ==978424== by 0x57E3BD9: PetscMallocAlign (mal.c:52) >>> ==978424== by 0x57E892F: PetscTrMallocDefault (mtr.c:175) >>> ==978424== by 0x57E5A77: PetscMallocA (mal.c:421) >>> ==978424== by 0x63B02F5: MatInvertBlockDiagonal_SeqAIJ (aij.c:3333) >>> ==978424== by 0x69B0599: MatInvertBlockDiagonal (matrix.c:10908) >>> ==978424== by 0x629D585: MatInvertBlockDiagonal_MPIAIJ (mpiaij.c:2588) >>> ==978424== by 0x69B0599: MatInvertBlockDiagonal (matrix.c:10908) >>> ==978424== by 0x7C7F726: PCSetUp_PBJacobi_Host (pbjacobi.c:256) >>> ==978424== by 0x7D737D8: PCSetUp_PBJacobi_Kokkos >>> (pbjacobi_kok.kokkos.cxx:90) >>> ==978424== by 0x7C803F8: PCSetUp_PBJacobi (pbjacobi.c:296) >>> ==978424== >>> ==978424== Invalid read of size 4 >>> ==978424== at 0x42B9B103: ??? 
(in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42AF0F12: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42C3EE7B: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42EEC474: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42B4B495: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42EC748F: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42B44781: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42CF766D: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42445504: ??? (in >>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53) >>> ==978424== by 0x42417A04: ??? (in >>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53) >>> ==978424== by 0x42468730: cudaMemcpy (in >>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53) >>> ==978424== by 0x4896979: cuda_memcpy_wrapper<> >>> (Kokkos_Cuda_Instance.hpp:365) >>> ==978424== by 0x4896979: Kokkos::Impl::DeepCopyCuda(void*, void >>> const*, unsigned long) (Kokkos_CudaSpace.cpp:62) >>> ==978424== Address 0xcb722e1c is 12 bytes after a block of size 1,664 >>> in arena "client" >>> ==978424== >>> ==978424== Invalid read of size 4 >>> ==978424== at 0x42B9B107: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42AF0F12: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42C3EE7B: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42EEC474: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42B4B495: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42EC748F: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42B44781: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42CF766D: ??? (in /usr/lib64/libcuda.so.550.127.08) >>> ==978424== by 0x42445504: ??? 
(in >>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53) >>> ==978424== by 0x42417A04: ??? (in >>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53) >>> ==978424== by 0x42468730: cudaMemcpy (in >>> /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2/targets/x86_64-linux/lib/libcudart.so.12.2.53) >>> ==978424== by 0x4896979: cuda_memcpy_wrapper<> >>> (Kokkos_Cuda_Instance.hpp:365) >>> ==978424== by 0x4896979: Kokkos::Impl::DeepCopyCuda(void*, void >>> const*, unsigned long) (Kokkos_CudaSpace.cpp:62) >>> ==978424== Address 0xcb722e1c is 12 bytes after a block of size 1,664 >>> in arena "client" >>> ==978424== >>> Residual norms for rap_mg_coarse_ solve. >>> 0 KSP Residual norm 1.118121970641e+01 >>> 1 KSP Residual norm 3.575439993035e-01 >>> [0]PETSC ERROR: --------------------- Error Message >>> -------------------------------------------------------------- >>> [0]PETSC ERROR: Diverged due to indefinite preconditioner, beta >>> -0.00553023, betaold 5.27744 >>> >>> >>> ksp/ex55: >>> >>> >>> PC pc; >>> PetscCall(KSPGetPC(ksp, &pc)); >>> PC_MG *mg = (PC_MG *)pc->data; >>> PC_MG_Levels **mglevels = mg->levels; >>> Mat P = mglevels[mg->nlevels-1]->interpolate; >>> PetscCall(MatViewFromOptions(mglevels[mg->nlevels-1]->A, NULL, >>> "-rap_mat_view")); >>> PetscCall(MatViewFromOptions(Amat, NULL, "-rap_mat_view")); >>> KSP ksp2; >>> PetscCall(KSPCreate(PETSC_COMM_WORLD, &ksp2)); >>> PetscCall(KSPSetOptionsPrefix(ksp2, "rap_")); >>> PetscCall(KSPSetFromOptions(ksp2)); >>> PetscCall(KSPGetPC(ksp2, &pc)); >>> PetscCall(KSPSetOperators(ksp2, Amat, Amat)); >>> PetscCall(PCMGSetGalerkin(pc, PC_MG_GALERKIN_PMAT)); >>> PetscCall(PCMGSetInterpolation(pc, 1, P)); >>> PetscCall(VecSet(bb, 1.0)); >>> PetscCall(PetscLogStagePush(stage[1])); >>> PetscCall(KSPSolve(ksp2, bb, xx)); >>> PetscCall(PetscLogStagePop()); >>> PetscCall(PetscFinalize()); >>> exit(12); >>> >>> >>> 
PetscCall(PetscLogStagePush(stage[1])); // original ex55 code >>> PetscCall(KSPSolve(ksp, bb, xx)); >>> >>> >>> *$ srun -n 4 valgrind --tool=memcheck --leak-check=no ./ex55 -ne 3 >>> -pc_type gamg -rap_pc_type mg -rap_ksp_monitor -rap_mg_levels_pc_type >>> jacobi -rap_mg_coarse_pc_type pbjacobi -rap_mg_coarse_ksp_monitor >>> -options_left -rap_pc_mg_levels 2 -rap_mg_coarse_ksp_type cg -mat_type >>> aijkokkos -fp_trap -ksp_monitor -rap_ksp_viewxx -info :pc,dm >>> -rap_mg_coarse_ksp_error_if_not_converged * >>> >>> >>> >>> >>> >>>