Perfect, that seems to have fixed the issue. Thanks for your help!

Steven
On Thu, 3 Apr 2025 at 04:52, Junchao Zhang <junchao.zh...@gmail.com> wrote:

Hi, Steven,

Thanks for the test, which helped me easily find the PETSc bug. I have a fix
at https://gitlab.com/petsc/petsc/-/merge_requests/8272.

VecKokkos does not use the offload mask for its GPU/CPU sync state, while
VecCUDA/VecHIP do. My expectation is that users should not call
VecGetOffloadMask(), because it is too low level; we have a bad API design
here.

Thank you!
--Junchao Zhang
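For callers that genuinely need to know where a vector's data currently
lives, a more portable route than VecGetOffloadMask() is
VecGetArrayReadAndMemType(), which returns the raw array together with its
PetscMemType and behaves the same across the CUDA, HIP, and Kokkos back
ends. A minimal sketch, assuming PETSc 3.17 or later; PrintVecMemType is a
made-up helper name, not a PETSc function:

#include <petscvec.h>

static PetscErrorCode PrintVecMemType(Vec v, const char *name)
{
  const PetscScalar *a;
  PetscMemType       mtype;

  PetscFunctionBegin;
  // Returns the raw array plus the memory type it lives in; note that for
  // device vector types this may itself trigger a one-time host-to-device
  // sync, since it hands back the device pointer
  PetscCall(VecGetArrayReadAndMemType(v, &a, &mtype));
  PetscCall(PetscPrintf(PETSC_COMM_SELF, "%s lives in %s memory\n", name,
                        PetscMemTypeDevice(mtype) ? "device" : "host"));
  PetscCall(VecRestoreArrayReadAndMemType(v, &a));
  PetscFunctionReturn(PETSC_SUCCESS);
}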
On Wed, Apr 2, 2025 at 7:11 PM Steven Dargaville <dargaville.ste...@gmail.com> wrote:

Hi

I have some code that does a solve with a PCMAT preconditioner. The mat used
is a shell, and inside the shell MatMult it calls VecPointwiseDivide with a
vector "diag" that is the diagonal of a matrix, extracted outside the shell.

If I use a mat/vec type of cuda, this happens without any GPU/CPU copies, as
I would expect. If however I use mat/vec type kokkos, a GPU/CPU copy occurs
at every iteration of the solve. It seems this is triggered by the offload
mask of the vector "diag": it stays at 1 (PETSC_OFFLOAD_CPU), and hence a
copy occurs in VecPointwiseDivide.

I would have expected the offload mask to be 256 (PETSC_OFFLOAD_KOKKOS)
after the first iteration, since with cuda the offload mask of "diag"
changes to 3 (PETSC_OFFLOAD_BOTH) after the first iteration.

Is this the expected behaviour with Kokkos, or is there something I need to
do to flag that "diag" has its values on the GPU, so the copies are avoided?
I have example C++ code that demonstrates this below. You can see the
difference when running with PETSc 3.23.0 and either "-log_view -mat_type
aijcusparse -vec_type cuda" or "-log_view -mat_type aijkokkos -vec_type
kokkos".

Thanks for your help
Steven

Example C++ code:

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

static char help[] = "Tests Kokkos for SHELL matrices\n\n";

#include <iostream>
#include <petscksp.h>
#include <petsclog.h>

typedef struct _n_User *User;
struct _n_User {
  Mat A;
  Vec diag;
};

static PetscErrorCode MatMult_User(Mat A, Vec X, Vec Y)
{
  User user;

  PetscFunctionBegin;
  PetscCall(MatShellGetContext(A, &user));

  // Print the offload mask inside the matmult
  PetscOffloadMask offloadmask;
  PetscCall(VecGetOffloadMask(X, &offloadmask));
  std::cout << "offload inside X " << offloadmask << std::endl;
  PetscCall(VecGetOffloadMask(Y, &offloadmask));
  std::cout << "offload inside Y " << offloadmask << std::endl;
  PetscCall(VecGetOffloadMask(user->diag, &offloadmask));
  std::cout << "offload inside diag " << offloadmask << std::endl;

  PetscCall(VecPointwiseDivide(Y, X, user->diag));
  PetscFunctionReturn(PETSC_SUCCESS);
}

int main(int argc, char **args)
{
  const PetscScalar xvals[] = {11, 13}, yvals[] = {17, 19};
  const PetscInt    inds[]  = {0, 1};
  PetscScalar       avals[] = {2, 3, 5, 7};
  Mat               S1, A;
  Vec               X, Y, diag;
  KSP               ksp;
  PC                pc;
  User              user;
  PetscLogStage     stage1, gpu_copy;

  PetscFunctionBeginUser;
  PetscCall(PetscInitialize(&argc, &args, NULL, help));

  // Build a matrix and vectors
  PetscCall(MatCreateFromOptions(PETSC_COMM_WORLD, NULL, 1, 2, 2, 2, 2, &A));
  PetscCall(MatSetUp(A));
  PetscCall(MatSetValues(A, 2, inds, 2, inds, avals, INSERT_VALUES));
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
  PetscCall(VecCreateFromOptions(PETSC_COMM_WORLD, NULL, 1, 2, 2, &X));
  PetscCall(VecSetValues(X, 2, inds, xvals, INSERT_VALUES));
  PetscCall(VecAssemblyBegin(X));
  PetscCall(VecAssemblyEnd(X));
  PetscCall(VecDuplicate(X, &Y));
  PetscCall(VecDuplicate(X, &diag));
  PetscCall(VecSetValues(Y, 2, inds, yvals, INSERT_VALUES));
  PetscCall(VecAssemblyBegin(Y));
  PetscCall(VecAssemblyEnd(Y));

  // Create a shell matrix
  PetscCall(MatGetDiagonal(A, diag));
  PetscCall(PetscNew(&user));
  user->A    = A;
  user->diag = diag;
  PetscCall(MatCreateShell(PETSC_COMM_WORLD, 2, 2, 2, 2, user, &S1));
  PetscCall(MatSetUp(S1));
  PetscCall(MatShellSetOperation(S1, MATOP_MULT, (void (*)(void))MatMult_User));
  PetscCall(MatAssemblyBegin(S1, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(S1, MAT_FINAL_ASSEMBLY));

  // Do a solve
  PetscCall(KSPCreate(PETSC_COMM_WORLD, &ksp));
  // Give the ksp a pcmat as the preconditioner and the mat is the shell
  PetscCall(KSPSetOperators(ksp, A, S1));
  PetscCall(KSPSetType(ksp, KSPRICHARDSON));
  PetscCall(KSPSetFromOptions(ksp));
  PetscCall(KSPGetPC(ksp, &pc));
  PetscCall(PCSetType(pc, PCMAT));
  PetscCall(KSPSetUp(ksp));

  // Print the offload mask before our solve
  PetscOffloadMask offloadmask;
  PetscCall(VecGetOffloadMask(X, &offloadmask));
  std::cout << "offload X " << offloadmask << std::endl;
  PetscCall(VecGetOffloadMask(Y, &offloadmask));
  std::cout << "offload Y " << offloadmask << std::endl;
  PetscCall(VecGetOffloadMask(user->diag, &offloadmask));
  std::cout << "offload diag " << offloadmask << std::endl;

  // Trigger any gpu copies in the first solve
  PetscCall(PetscLogStageRegister("gpu_copy", &gpu_copy));
  PetscCall(PetscLogStagePush(gpu_copy));
  PetscCall(KSPSolve(ksp, X, Y));
  PetscCall(PetscLogStagePop());

  // There should be no copies in this solve
  PetscCall(PetscLogStageRegister("no copy", &stage1));
  PetscCall(PetscLogStagePush(stage1));
  PetscCall(KSPSolve(ksp, X, Y));
  PetscCall(PetscLogStagePop());

  PetscCall(KSPDestroy(&ksp));
  PetscCall(MatDestroy(&S1));
  PetscCall(MatDestroy(&A));
  PetscCall(VecDestroy(&X));
  PetscCall(VecDestroy(&Y));
  PetscCall(VecDestroy(&diag));
  PetscCall(PetscFree(user));
  PetscCall(PetscFinalize());
  return 0;
}
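If one wanted to follow the advice above and drop VecGetOffloadMask() from
the reproducer entirely, the diagnostic prints inside MatMult_User could be
swapped for the hypothetical PrintVecMemType() helper sketched earlier; a
hedged drop-in variant:

static PetscErrorCode MatMult_User(Mat A, Vec X, Vec Y)
{
  User user;

  PetscFunctionBegin;
  PetscCall(MatShellGetContext(A, &user));
  // Report where each vector's data lives without touching the offload mask
  PetscCall(PrintVecMemType(X, "X"));
  PetscCall(PrintVecMemType(Y, "Y"));
  PetscCall(PrintVecMemType(user->diag, "diag"));
  PetscCall(VecPointwiseDivide(Y, X, user->diag));
  PetscFunctionReturn(PETSC_SUCCESS);
}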