From 89d9e95ab977aeea12c283e0da3e7da196b1b4da Mon Sep 17 00:00:00 2001 From: sdargavi Date: Tue, 31 Mar 2026 23:36:56 +0100 Subject: [PATCH 01/60] Add a fence before every MatSeqAIJGetCSRAndMemType call --- src/Gmres_Polyk.kokkos.cxx | 1 + src/Grid_Transferk.kokkos.cxx | 5 +++++ src/MatDiagDomk.kokkos.cxx | 2 ++ src/PETSc_Helperk.kokkos.cxx | 7 +++++++ src/PMISR_Modulek.kokkos.cxx | 3 +++ src/SAI_Zk.kokkos.cxx | 1 + 6 files changed, 19 insertions(+) diff --git a/src/Gmres_Polyk.kokkos.cxx b/src/Gmres_Polyk.kokkos.cxx index 3df0915a..ea0106f3 100644 --- a/src/Gmres_Polyk.kokkos.cxx +++ b/src/Gmres_Polyk.kokkos.cxx @@ -162,6 +162,7 @@ PETSC_INTERN void mat_mult_powers_share_sparsity_kokkos(Mat *input_mat, const in // Get pointers to the i,j,vals on the device // This should happen after all the (potentially) host matscale, mataxpy and matshift above // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_submat_i = nullptr, *device_submat_j = nullptr; PetscMemType mtype; PetscScalar *device_submat_vals = nullptr; diff --git a/src/Grid_Transferk.kokkos.cxx b/src/Grid_Transferk.kokkos.cxx index d48fe6d8..7ddfd733 100644 --- a/src/Grid_Transferk.kokkos.cxx +++ b/src/Grid_Transferk.kokkos.cxx @@ -56,6 +56,7 @@ PETSC_INTERN void generate_one_point_with_one_entry_from_sparse_kokkos(Mat *inpu // ~~~~~~~~~~~~ // Get pointers to the i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i = nullptr, *device_local_j = nullptr, *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; PetscMemType mtype; PetscScalar *device_local_vals = nullptr, *device_nonlocal_vals = nullptr; @@ -375,6 +376,7 @@ PETSC_INTERN void compute_P_from_W_kokkos(Mat *input_mat, PetscInt global_row_st // ~~~~~~~~~~~~ // Get pointers to the i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i = nullptr, *device_local_j = nullptr, *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; PetscMemType mtype; PetscScalar *device_local_vals = nullptr, *device_nonlocal_vals = nullptr; @@ -596,6 +598,7 @@ PETSC_INTERN void compute_P_from_W_kokkos(Mat *input_mat, PetscInt global_row_st // Annoyingly there isn't currently the ability to get views for i (or j) const PetscInt *device_local_i_output = nullptr, *device_nonlocal_i_ouput = nullptr; PetscMemType mtype; + Kokkos::fence(); PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_local_output, &device_local_i_output, NULL, NULL, &mtype)); if (mpi) PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_nonlocal_output, &device_nonlocal_i_ouput, NULL, NULL, &mtype)); @@ -863,6 +866,7 @@ PETSC_INTERN void compute_R_from_Z_kokkos(Mat *input_mat, PetscInt global_row_st // ~~~~~~~~~~~~ // Get pointers to the i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i = nullptr, *device_local_j = nullptr, *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; PetscMemType mtype; PetscScalar *device_local_vals = nullptr, *device_nonlocal_vals = nullptr; @@ -1070,6 +1074,7 @@ PETSC_INTERN void compute_R_from_Z_kokkos(Mat *input_mat, PetscInt global_row_st // Annoyingly there isn't currently the ability to get views for i (or j) const PetscInt *device_local_i_output = nullptr, *device_local_j_output = nullptr, *device_nonlocal_i_ouput = nullptr; PetscMemType mtype; + Kokkos::fence(); PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_local_output, &device_local_i_output, &device_local_j_output, NULL, &mtype)); if (mpi) PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_nonlocal_output, &device_nonlocal_i_ouput, NULL, NULL, &mtype)); diff --git a/src/MatDiagDomk.kokkos.cxx b/src/MatDiagDomk.kokkos.cxx index 3b1a7062..dd6752be 100644 --- a/src/MatDiagDomk.kokkos.cxx +++ b/src/MatDiagDomk.kokkos.cxx @@ -96,6 +96,7 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio // ~~~~~~~~~~~~ // Get pointers to the local i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i = nullptr, *device_local_j = nullptr; PetscScalar *device_local_vals = nullptr; PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_local, &device_local_i, &device_local_j, &device_local_vals, &mtype)); @@ -184,6 +185,7 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio // ~~~~~~~~~~~~ // Get pointers to the nonlocal i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; PetscScalar *device_nonlocal_vals = nullptr; PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_nonlocal, &device_nonlocal_i, &device_nonlocal_j, &device_nonlocal_vals, &mtype)); diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 464001a5..1c672a84 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -160,6 +160,7 @@ PETSC_INTERN void remove_small_from_sparse_kokkos(Mat *input_mat, const PetscRea // ~~~~~~~~~~~~ // Get pointers to the i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i = nullptr, *device_local_j = nullptr, *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; PetscMemType mtype; PetscScalar *device_local_vals = nullptr, *device_nonlocal_vals = nullptr; @@ -882,6 +883,7 @@ PETSC_INTERN void remove_from_sparse_match_kokkos(Mat *input_mat, Mat *output_ma // ~~~~~~~~~~~~ // Get pointers to the i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i = nullptr, *device_local_j = nullptr, *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; PetscMemType mtype; PetscScalar *device_local_vals = nullptr, *device_nonlocal_vals = nullptr; @@ -1186,6 +1188,7 @@ PETSC_INTERN void MatSetAllValues_kokkos(Mat *input_mat, PetscReal val) // ~~~~~~~~~~~~ // Get pointers to the i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i = nullptr, *device_local_j = nullptr, *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; PetscMemType mtype; PetscScalar *device_local_vals = nullptr, *device_nonlocal_vals = nullptr; @@ -1273,6 +1276,7 @@ PETSC_INTERN void mat_duplicate_copy_plus_diag_kokkos(Mat *input_mat, const int // ~~~~~~~~~~~~ // Get pointers to the i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i = nullptr, *device_local_j = nullptr, *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; PetscMemType mtype; PetscScalar *device_local_vals = nullptr, *device_nonlocal_vals = nullptr; @@ -1508,6 +1512,7 @@ PETSC_INTERN void mat_duplicate_copy_plus_diag_kokkos(Mat *input_mat, const int Kokkos::deep_copy(exec, a_local_d, 0.0); // Annoyingly there isn't currently the ability to get views for i (or j) + Kokkos::fence(); const PetscInt *device_local_i_output = nullptr, *device_local_j_output = nullptr, *device_nonlocal_i_ouput = nullptr; PetscMemType mtype; PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_local_output, &device_local_i_output, &device_local_j_output, NULL, &mtype)); @@ -1857,6 +1862,7 @@ PETSC_INTERN void MatCreateSubMatrix_Seq_kokkos(Mat *input_mat, PetscIntKokkosVi // ~~~~~~~~~~~~ // Get pointers to the i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i = nullptr, *device_local_j = nullptr; PetscMemType mtype; PetscScalar *device_local_vals = nullptr; @@ -2063,6 +2069,7 @@ PETSC_INTERN void MatCreateSubMatrix_Seq_kokkos(Mat *input_mat, PetscIntKokkosVi a_local_d = aijkok_local_output->a_dual.view_device(); // Annoyingly there isn't currently the ability to get views for i (or j) + Kokkos::fence(); const PetscInt *device_local_i_output = nullptr; PetscMemType mtype; PetscCallVoid(MatSeqAIJGetCSRAndMemType(*output_mat, &device_local_i_output, NULL, NULL, &mtype)); diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx index a7f804a8..a2435434 100644 --- a/src/PMISR_Modulek.kokkos.cxx +++ b/src/PMISR_Modulek.kokkos.cxx @@ -50,6 +50,7 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // ~~~~~~~~~~~~ // Get pointers to the i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i = nullptr, *device_local_j = nullptr, *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; PetscMemType mtype; PetscScalar *device_local_vals = nullptr, *device_nonlocal_vals = nullptr; @@ -603,6 +604,7 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength // ~~~~~~~~~~~~ // Get pointers to the i,j on the device for all the matrices we need // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i_spst = nullptr, *device_local_j_spst = nullptr; const PetscInt *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; const PetscInt *device_nonlocal_i_transpose = nullptr, *device_nonlocal_j_transpose = nullptr; @@ -1263,6 +1265,7 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons // ~~~~~~~~~~~~ // Get pointers to the i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i = nullptr, *device_local_j = nullptr, *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; PetscMemType mtype; PetscScalar *device_local_vals = nullptr, *device_nonlocal_vals = nullptr; diff --git a/src/SAI_Zk.kokkos.cxx b/src/SAI_Zk.kokkos.cxx index 7334b886..9f479b16 100644 --- a/src/SAI_Zk.kokkos.cxx +++ b/src/SAI_Zk.kokkos.cxx @@ -207,6 +207,7 @@ PETSC_INTERN void calculate_and_build_sai_z_kokkos(Mat *A_ff, Mat *A_cf, Mat *sp // ~~~~~~~~~~~~~~ // Get device CSR pointers for all matrices // ~~~~~~~~~~~~~~ + Kokkos::fence(); PetscMemType mtype; // Submatrix (non-local rows of A_ff) From bd7cb894ae2a44d89c23b70f4bd3a255304d84ea Mon Sep 17 00:00:00 2001 From: sdargavi Date: Wed, 1 Apr 2026 00:23:20 +0100 Subject: [PATCH 02/60] Ensure fences are called before MatCreateSeqAIJKokkosWithKokkosViews --- src/Device_Datak.kokkos.cxx | 2 ++ src/PETSc_Helperk.kokkos.cxx | 5 +++++ src/PMISR_Modulek.kokkos.cxx | 1 + 3 files changed, 8 insertions(+) diff --git a/src/Device_Datak.kokkos.cxx b/src/Device_Datak.kokkos.cxx index b11eb31f..be3b3ca6 100644 --- a/src/Device_Datak.kokkos.cxx +++ b/src/Device_Datak.kokkos.cxx @@ -173,6 +173,8 @@ PETSC_INTERN void create_cf_is_kokkos(Mat *input_mat, IS *is_fine, IS *is_coarse PetscCallVoid(PetscMalloc1(n_coarse, &is_coarse_array)); PetscIntKokkosViewHost is_coarse_h = PetscIntKokkosViewHost(is_coarse_array, n_coarse); + Kokkos::fence(); + // Copy over the indices to the host // Device to host so don't need to specify exec space Kokkos::deep_copy(is_fine_h, is_fine_local_d); diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 1c672a84..6848eb53 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -43,6 +43,7 @@ PETSC_INTERN void rewrite_j_global_to_local(PetscInt colmap_max_size, PetscInt & PetscIntKokkosView j_nonlocal_d_sorted("j_nonlocal_d_sorted", j_nonlocal_d.extent(0)); Kokkos::deep_copy(exec, j_nonlocal_d_sorted, j_nonlocal_d); Kokkos::sort(j_nonlocal_d_sorted); + Kokkos::fence(); // Unique copy returns a copy of sorted j_nonlocal_d_sorted in order, but with all the duplicate entries removed auto unique_end_it = Kokkos::Experimental::unique_copy(exec, j_nonlocal_d_sorted, colmap_output_d); @@ -1208,6 +1209,7 @@ PETSC_INTERN void MatSetAllValues_kokkos(Mat *input_mat, PetscReal val) Kokkos::deep_copy(exec, a_nonlocal_d, val); PetscCallVoid(PetscLogCpuToGpu(bytes)); } + Kokkos::fence(); // Have to specify we've modifed data on the device // Want to call MatSeqAIJKokkosModifyDevice but its PETSC_INTERN @@ -1740,6 +1742,7 @@ PETSC_INTERN void MatAXPY_kokkos(Mat *Y, PetscScalar alpha, Mat *X) Kokkos::deep_copy(exec, i_local_d_copy, i_local_d_z); Kokkos::deep_copy(exec, j_local_d_copy, j_local_d_z); } + Kokkos::fence(); // We can create our local diagonal block matrix directly on the device Mat Z_local; @@ -1831,6 +1834,8 @@ PETSC_INTERN void MatAXPY_kokkos(Mat *Y, PetscScalar alpha, Mat *X) Kokkos::deep_copy(exec, j_nonlocal_d_copy, j_nonlocal_d_z); } + Kokkos::fence(); + // We can create our nonlocal diagonal block matrix directly on the device Mat Z_nonlocal; PetscCallVoid(MatCreateSeqAIJKokkosWithKokkosViews(PETSC_COMM_SELF, local_rows, col_ao_output, i_nonlocal_d_copy, j_nonlocal_d_copy, a_nonlocal_d_copy, &Z_nonlocal)); diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx index a2435434..acf28e5d 100644 --- a/src/PMISR_Modulek.kokkos.cxx +++ b/src/PMISR_Modulek.kokkos.cxx @@ -1312,6 +1312,7 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons // Flip the sign if pmis if (pmis_int == 1) measure_local_d(i) *= -1; }); + Kokkos::fence(); // Call the existing measure cf markers function pmisr_existing_measure_cf_markers_kokkos(strength_mat, max_luby_steps, pmis_int, measure_local_d, cf_markers_d, zero_measure_c_point_int); From cc6e947e58868c27e1a33718f5d63c56ba43eb34 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Wed, 1 Apr 2026 01:07:24 +0100 Subject: [PATCH 03/60] Fence the sorting more carefully --- src/Grid_Transferk.kokkos.cxx | 1 + src/PETSc_Helperk.kokkos.cxx | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/Grid_Transferk.kokkos.cxx b/src/Grid_Transferk.kokkos.cxx index 7ddfd733..d8f92da0 100644 --- a/src/Grid_Transferk.kokkos.cxx +++ b/src/Grid_Transferk.kokkos.cxx @@ -1159,6 +1159,7 @@ PETSC_INTERN void compute_R_from_Z_kokkos(Mat *input_mat, PetscInt global_row_st // Now we have to sort the local column indices, as we add in the identity at the // end of our local j indices KokkosCsrMatrix csrmat_local = KokkosCsrMatrix("csrmat_local", local_rows_z, local_full_cols, a_local_d.extent(0), a_local_d, i_local_d, j_local_d); + Kokkos::fence(); KokkosSparse::sort_crs_matrix(csrmat_local); // Let's make sure everything on the device is finished diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 6848eb53..4c13de3a 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -42,7 +42,7 @@ PETSC_INTERN void rewrite_j_global_to_local(PetscInt colmap_max_size, PetscInt & { PetscIntKokkosView j_nonlocal_d_sorted("j_nonlocal_d_sorted", j_nonlocal_d.extent(0)); Kokkos::deep_copy(exec, j_nonlocal_d_sorted, j_nonlocal_d); - Kokkos::sort(j_nonlocal_d_sorted); + Kokkos::sort(exec, j_nonlocal_d_sorted); Kokkos::fence(); // Unique copy returns a copy of sorted j_nonlocal_d_sorted in order, but with all the duplicate entries removed @@ -770,6 +770,7 @@ PETSC_INTERN void remove_small_from_sparse_kokkos(Mat *input_mat, const PetscRea if (added_any_diagonal) { KokkosCsrMatrix csrmat_local = KokkosCsrMatrix("csrmat_local", local_rows, local_cols, a_local_d.extent(0), a_local_d, i_local_d, j_local_d); + Kokkos::fence(); KokkosSparse::sort_crs_matrix(csrmat_local); if (mpi) @@ -777,6 +778,7 @@ PETSC_INTERN void remove_small_from_sparse_kokkos(Mat *input_mat, const PetscRea // The column size is not right here (it will be <= cols_ao) // but it shouldn't matter as we are only construting an explicit kokkos csr matrix here so it can sort KokkosCsrMatrix csrmat_nonlocal = KokkosCsrMatrix("csrmat_nonlocal", local_rows, cols_ao, a_nonlocal_d.extent(0), a_nonlocal_d, i_nonlocal_d, j_nonlocal_d); + Kokkos::fence(); KokkosSparse::sort_crs_matrix(csrmat_nonlocal); } } @@ -1603,6 +1605,7 @@ PETSC_INTERN void mat_duplicate_copy_plus_diag_kokkos(Mat *input_mat, const int // Now we have to sort the local column indices, as we add in the identity at the // end of our local j indices KokkosCsrMatrix csrmat_local = KokkosCsrMatrix("csrmat_local", local_rows, local_cols, a_local_d.extent(0), a_local_d, i_local_d, j_local_d); + Kokkos::fence(); KokkosSparse::sort_crs_matrix(csrmat_local); // Let's make sure everything on the device is finished @@ -1723,8 +1726,8 @@ PETSC_INTERN void MatAXPY_kokkos(Mat *Y, PetscScalar alpha, Mat *X) KernelHandle kh_local; kh_local.create_spadd_handle(true); // X, Y are sorted - KokkosSparse::spadd_symbolic(&kh_local, xkok_local->csrmat, ykok_local->csrmat, zcsr_local); - KokkosSparse::spadd_numeric(&kh_local, alpha, xkok_local->csrmat, (PetscScalar)1.0, ykok_local->csrmat, zcsr_local); + KokkosSparse::spadd_symbolic(exec, &kh_local, xkok_local->csrmat, ykok_local->csrmat, zcsr_local); + KokkosSparse::spadd_numeric(exec, &kh_local, alpha, xkok_local->csrmat, (PetscScalar)1.0, ykok_local->csrmat, zcsr_local); kh_local.destroy_spadd_handle(); @@ -1738,6 +1741,9 @@ PETSC_INTERN void MatAXPY_kokkos(Mat *Y, PetscScalar alpha, Mat *X) i_local_d_copy = Kokkos::View("i_local_d_copy", i_local_d_z.extent(0)); j_local_d_copy = Kokkos::View("j_local_d_copy", j_local_d_z.extent(0)); + // Let's make sure everything on the device is finished + Kokkos::fence(); + Kokkos::deep_copy(exec, a_local_d_copy, a_local_d_z); Kokkos::deep_copy(exec, i_local_d_copy, i_local_d_z); Kokkos::deep_copy(exec, j_local_d_copy, j_local_d_z); @@ -1798,11 +1804,13 @@ PETSC_INTERN void MatAXPY_kokkos(Mat *Y, PetscScalar alpha, Mat *X) KernelHandle kh_nonlocal; kh_nonlocal.create_spadd_handle(true); - KokkosSparse::spadd_symbolic(&kh_nonlocal, xkok_nonlocal->csrmat, ykok_nonlocal->csrmat, zcsr_nonlocal); - KokkosSparse::spadd_numeric(&kh_nonlocal, alpha, xkok_nonlocal->csrmat, (PetscScalar)1.0, ykok_nonlocal->csrmat, zcsr_nonlocal); + KokkosSparse::spadd_symbolic(exec, &kh_nonlocal, xkok_nonlocal->csrmat, ykok_nonlocal->csrmat, zcsr_nonlocal); + KokkosSparse::spadd_numeric(exec, &kh_nonlocal, alpha, xkok_nonlocal->csrmat, (PetscScalar)1.0, ykok_nonlocal->csrmat, zcsr_nonlocal); kh_nonlocal.destroy_spadd_handle(); + Kokkos::fence(); + // Can now destroy the copy PetscCallVoid(MatDestroy(&mat_nonlocal_x_copy)); From 38390e4ab3947e29128deeba7a57e97de98e23dd Mon Sep 17 00:00:00 2001 From: sdargavi Date: Wed, 1 Apr 2026 01:38:48 +0100 Subject: [PATCH 04/60] Add fence after unique_copy --- src/PETSc_Helperk.kokkos.cxx | 1 + 1 file changed, 1 insertion(+) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 4c13de3a..666a9879 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -47,6 +47,7 @@ PETSC_INTERN void rewrite_j_global_to_local(PetscInt colmap_max_size, PetscInt & // Unique copy returns a copy of sorted j_nonlocal_d_sorted in order, but with all the duplicate entries removed auto unique_end_it = Kokkos::Experimental::unique_copy(exec, j_nonlocal_d_sorted, colmap_output_d); + Kokkos::fence(); auto begin_it = Kokkos::Experimental::begin(colmap_output_d); count_ptr_arith = unique_end_it - begin_it; } From edf2fd8be6ac07788bc8c4d52b42f34ef13d7263 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Wed, 1 Apr 2026 12:31:00 +0100 Subject: [PATCH 05/60] Fence before off-diagonal MatCreateSubMatrix_Seq_kokkos call --- src/PETSc_Helperk.kokkos.cxx | 1 + 1 file changed, 1 insertion(+) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 666a9879..fc64f79e 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2407,6 +2407,7 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV } // We can now create the off-diagonal component + Kokkos::fence(); MatCreateSubMatrix_Seq_kokkos(&mat_nonlocal, is_row_d_d, is_col_o_d, reuse_int, &output_mat_nonlocal); // If it's our first time through we have to create our output matrix From 90d3ed90cf0b1133af77ff1417cd1167e765e5aa Mon Sep 17 00:00:00 2001 From: sdargavi Date: Wed, 1 Apr 2026 12:33:13 +0100 Subject: [PATCH 06/60] Make scatters synchronous --- src/MatDiagDomk.kokkos.cxx | 2 +- src/PETSc_Helperk.kokkos.cxx | 10 ++++------ src/PMISR_Modulek.kokkos.cxx | 10 ++++------ 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/src/MatDiagDomk.kokkos.cxx b/src/MatDiagDomk.kokkos.cxx index dd6752be..ced90f5a 100644 --- a/src/MatDiagDomk.kokkos.cxx +++ b/src/MatDiagDomk.kokkos.cxx @@ -87,6 +87,7 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio // Ensure send/receive buffers are stable before Begin. Kokkos::fence(); PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); } // ~~~~~~~~~~~~~~~ @@ -159,7 +160,6 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio // Finish the in-flight scatter and only then read from the receive buffer. if (mpi) { - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); { ConstPetscScalarKokkosView lvec_scalar_d; PetscCallVoid(VecGetKokkosView(mat_mpi->lvec, &lvec_scalar_d)); diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index fc64f79e..05a46cf1 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2282,6 +2282,8 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV // Ensure send/receive buffers are stable before Begin. Kokkos::fence(); PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, x_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); + // x scatter completed: mat_mpi->lvec is now safe to read. + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, x_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); // Fill cmap_vec on device: cmap[is_col(i)] = i + isstart, rest = -1 { @@ -2305,13 +2307,12 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV auto is_col_o_match_d = PetscIntKokkosView("is_col_o_match_d", cols_ao+1); Kokkos::deep_copy(exec, is_col_o_match_d, 0); - // x scatter completed: mat_mpi->lvec is now safe to read. - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, x_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); - // Start cmap scatter only after finishing x scatter on the same Mvctx. // Ensure send/receive buffers are stable before Begin. Kokkos::fence(); PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, cmap_vec, lcmap_vec, INSERT_VALUES, SCATTER_FORWARD)); + // cmap scatter completed: lcmap_vec is now safe to read. + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, cmap_vec, lcmap_vec, INSERT_VALUES, SCATTER_FORWARD)); if (cols_ao > 0) { @@ -2350,9 +2351,6 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV is_col_o_d = PetscIntKokkosView("is_col_o_d", col_ao_output); garray_output_d = PetscIntKokkosView("garray_output_d", col_ao_output); - // cmap scatter completed: lcmap_vec is now safe to read. - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, cmap_vec, lcmap_vec, INSERT_VALUES, SCATTER_FORWARD)); - // Loop over all the cols in the input matrix { ConstPetscScalarKokkosView lcmap_scalar_d; diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx index acf28e5d..fed521d6 100644 --- a/src/PMISR_Modulek.kokkos.cxx +++ b/src/PMISR_Modulek.kokkos.cxx @@ -196,6 +196,8 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // Ensure the root buffer is no longer being written before Begin. Kokkos::fence(); PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + // Complete the in-flight forward scatter before reading the receive buffer. + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); } @@ -262,9 +264,6 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // ~~~~~~~~ if (mpi) { - // Complete the in-flight forward scatter before reading the receive buffer. - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); - // Convert PetscScalar → int after End, when the receive buffer is complete. { ConstPetscScalarKokkosView leaf_scalar_d; @@ -383,6 +382,8 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // Ensure send/receive buffers are stable before Begin. Kokkos::fence(); PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_leaf_vec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); + // Complete reverse scatter before reading reduced root buffer. + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_leaf_vec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); // While reverse scatter is in-flight, do local-only updates in cf_markers_temp_d. // This must not touch scatter_root_vec/scatter_leaf_vec. @@ -411,9 +412,6 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co } }); - // Complete reverse scatter before reading reduced root buffer. - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_leaf_vec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); - // Convert PetscScalar → int back to cf_markers_d after End. { ConstPetscScalarKokkosView root_scalar_d; From 8e6e36b537189f2415259accbc203012225961fb Mon Sep 17 00:00:00 2001 From: sdargavi Date: Wed, 1 Apr 2026 20:15:39 +0100 Subject: [PATCH 07/60] Add fence before ISCopyLocal --- src/VecISCopyLocalk.kokkos.cxx | 1 + 1 file changed, 1 insertion(+) diff --git a/src/VecISCopyLocalk.kokkos.cxx b/src/VecISCopyLocalk.kokkos.cxx index f5b1ff74..8b653c96 100644 --- a/src/VecISCopyLocalk.kokkos.cxx +++ b/src/VecISCopyLocalk.kokkos.cxx @@ -124,6 +124,7 @@ PETSC_INTERN void set_VecISCopyLocal_kokkos_our_level(int our_level, PetscInt gl // Do the equivalent of veciscopy on local data using the IS data on the device PETSC_INTERN void VecISCopyLocal_kokkos(int our_level, int fine_int, Vec *vfull, int mode_int, Vec *vreduced) { + Kokkos::fence(); const int level_idx = our_level - 1; // Can't use the shared pointer directly within the parallel From 2d3ed11f3bf71df71b1f5f1c2cb7494c01ce8e30 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Thu, 2 Apr 2026 03:36:49 +0100 Subject: [PATCH 08/60] Ensure sort_crs_matrix and spadd in Kokkos are only performed on matrices with local column indices to prevent OOB errors --- src/PETSc_Helperk.kokkos.cxx | 239 ++++++++++++++++++++++------------- 1 file changed, 149 insertions(+), 90 deletions(-) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 05a46cf1..8c6fe9ed 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -25,20 +25,37 @@ static PetscErrorCode check_exact_petscint_to_scalar_encoding(PetscInt max_encod //------------------------------------------------------------------------------------------------------------------------ -// Generate the colmap and rewrite input global j indices to local given the calculated colmap -PETSC_INTERN void rewrite_j_global_to_local(PetscInt colmap_max_size, PetscInt &col_ao_output, PetscIntKokkosView j_nonlocal_d, PetscInt **garray_host) +// Remap each entry in j_d from a global index to its local index via binary search into garray_d. +// garray_d must be a sorted array of unique global indices. +// Fences internally. +static void remap_j_to_local_device(PetscIntKokkosView j_d, PetscIntKokkosView garray_d, PetscInt col_ao_output) +{ + auto exec = PetscGetKokkosExecutionSpace(); + + if (j_d.extent(0) == 0) return; + Kokkos::parallel_for( + Kokkos::RangePolicy<>(exec, 0, j_d.extent(0)), KOKKOS_LAMBDA(const PetscInt i) { + j_d(i) = binary_search_sorted(garray_d, col_ao_output, j_d(i)); + }); + Kokkos::fence(); +} + +//------------------------------------------------------------------------------------------------------------------------ + +// Build garray on device from global indices in j_nonlocal_d and remap j_nonlocal_d to local in-place. +// garray_d (out) is a device view of the sorted unique global column indices (size col_ao_output). +static void rewrite_j_global_to_local_device(PetscInt colmap_max_size, PetscInt &col_ao_output, PetscIntKokkosView j_nonlocal_d, PetscIntKokkosView &garray_d) { auto exec = PetscGetKokkosExecutionSpace(); // Need to preallocate to the max size - PetscIntKokkosView colmap_output_d("colmap_output_d", colmap_max_size); + PetscIntKokkosView colmap_output_d("colmap_output_d", colmap_max_size); col_ao_output = 0; - // Take a copy of j and sort it and then build garray if (j_nonlocal_d.extent(0) > 0) { ptrdiff_t count_ptr_arith = -1; - // Scoped so we don't keep the copy of j around very long + // Scoped so we don't keep the sorted copy of j around very long { PetscIntKokkosView j_nonlocal_d_sorted("j_nonlocal_d_sorted", j_nonlocal_d.extent(0)); Kokkos::deep_copy(exec, j_nonlocal_d_sorted, j_nonlocal_d); @@ -53,54 +70,36 @@ PETSC_INTERN void rewrite_j_global_to_local(PetscInt colmap_max_size, PetscInt & } col_ao_output = static_cast(count_ptr_arith); - // Create some host space for the output garray (that stays in scope) and copy it - PetscCallVoid(PetscMalloc1(col_ao_output, garray_host)); - PetscIntKokkosViewHost colmap_output_h = PetscIntKokkosViewHost(*garray_host, col_ao_output); PetscInt zero = 0; + garray_d = Kokkos::subview(colmap_output_d, Kokkos::make_pair(zero, col_ao_output)); + + // Remap j_nonlocal_d to local indices using binary search into garray_d + // This fences internally + remap_j_to_local_device(j_nonlocal_d, garray_d, col_ao_output); + } +} + +//------------------------------------------------------------------------------------------------------------------------ + +// Generate the colmap and rewrite input global j indices to local given the calculated colmap +PETSC_INTERN void rewrite_j_global_to_local(PetscInt colmap_max_size, PetscInt &col_ao_output, PetscIntKokkosView j_nonlocal_d, PetscInt **garray_host) +{ + auto exec = PetscGetKokkosExecutionSpace(); + PetscIntKokkosView garray_d; + + // This fences internally + rewrite_j_global_to_local_device(colmap_max_size, col_ao_output, j_nonlocal_d, garray_d); + + // Always allocate host array (even zero-size) + PetscCallVoid(PetscMalloc1(col_ao_output, garray_host)); + if (col_ao_output > 0) + { + PetscIntKokkosViewHost colmap_output_h = PetscIntKokkosViewHost(*garray_host, col_ao_output); // Device to host so don't need to specify exec space - Kokkos::deep_copy(colmap_output_h, Kokkos::subview(colmap_output_d, Kokkos::make_pair(zero, col_ao_output))); + Kokkos::deep_copy(colmap_output_h, garray_d); // Log copy with petsc size_t bytes = col_ao_output * sizeof(PetscInt); - PetscCallVoid(PetscLogGpuToCpu(bytes)); - } - - // ~~~~~~~~~~ - // Now we can go and overwrite the global indices in j with the local equivalents - // ~~~~~~~~~~ - // Do we have any nonlocal columns - if (col_ao_output == 0) - { - // Silly but depending on the compiler this may return a non-null pointer - col_ao_output = 0; - PetscCallVoid(PetscMalloc1(col_ao_output, garray_host)); - } - else - { - // Binary search sorted colmap to find our local index - // Originally used Kokkos::UnorderedMap here but it only handles up to uint32_t - // entries - Kokkos::parallel_for( - Kokkos::RangePolicy<>(exec, 0, j_nonlocal_d.extent(0)), KOKKOS_LAMBDA(const PetscInt i) { - - PetscInt low = 0; - PetscInt count = col_ao_output; // Number of elements in colmap_output_d - PetscInt step = -1; - PetscInt mid_idx = -1; - - while (count > 0) { - step = count / 2; - mid_idx = low + step; - if (colmap_output_d(mid_idx) < j_nonlocal_d(i)) { - low = mid_idx + 1; - count -= (step + 1); - } else { - count = step; - } - } - j_nonlocal_d(i) = low; - }); - // Ensure the rewrite is finished before we return - Kokkos::fence(); + PetscCallVoid(PetscLogGpuToCpu(bytes)); } } @@ -752,9 +751,19 @@ PETSC_INTERN void remove_small_from_sparse_kokkos(Mat *input_mat, const PetscRea // Let's make sure everything on the device is finished Kokkos::fence(); + // Convert j_nonlocal_d from global to local indices now, before any sort below. + // All global indices (including any diagonals added in the loop above) are finalised. + // garray_d holds the sorted unique global column indices on device. + PetscIntKokkosView garray_d; + PetscInt col_ao_output = 0; + if (mpi) { + // This fences internally + rewrite_j_global_to_local_device(cols_ao, col_ao_output, j_nonlocal_d, garray_d); + } + // Now we may have to sort the column indices if (lump_int) - { + { // Reduce to see if we ever added a diagonal bool added_any_diagonal = false; Kokkos::parallel_reduce( @@ -764,23 +773,22 @@ PETSC_INTERN void remove_small_from_sparse_kokkos(Mat *input_mat, const PetscRea if (!existing_diag_d(i)) thread_result = true; }, Kokkos::LOr(added_any_diagonal) - ); + ); // If we did add a diagonal, it got added to the end of the columns on each row, so will have to sort // It also could have been added to either the local or nonlocal components given not square - if (added_any_diagonal) + if (added_any_diagonal) { - KokkosCsrMatrix csrmat_local = KokkosCsrMatrix("csrmat_local", local_rows, local_cols, a_local_d.extent(0), a_local_d, i_local_d, j_local_d); - Kokkos::fence(); - KokkosSparse::sort_crs_matrix(csrmat_local); - + KokkosCsrMatrix csrmat_local = KokkosCsrMatrix("csrmat_local", local_rows, local_cols, a_local_d.extent(0), a_local_d, i_local_d, j_local_d); + Kokkos::fence(); + KokkosSparse::sort_crs_matrix(csrmat_local); + if (mpi) { - // The column size is not right here (it will be <= cols_ao) - // but it shouldn't matter as we are only construting an explicit kokkos csr matrix here so it can sort - KokkosCsrMatrix csrmat_nonlocal = KokkosCsrMatrix("csrmat_nonlocal", local_rows, cols_ao, a_nonlocal_d.extent(0), a_nonlocal_d, i_nonlocal_d, j_nonlocal_d); - Kokkos::fence(); - KokkosSparse::sort_crs_matrix(csrmat_nonlocal); + // j_nonlocal_d now contains local indices; use col_ao_output as numCols + KokkosCsrMatrix csrmat_nonlocal = KokkosCsrMatrix("csrmat_nonlocal", local_rows, col_ao_output, a_nonlocal_d.extent(0), a_nonlocal_d, i_nonlocal_d, j_nonlocal_d); + Kokkos::fence(); + KokkosSparse::sort_crs_matrix(csrmat_nonlocal); } } } @@ -791,23 +799,27 @@ PETSC_INTERN void remove_small_from_sparse_kokkos(Mat *input_mat, const PetscRea PetscCallVoid(MatCreateSeqAIJKokkosWithKokkosViews(PETSC_COMM_SELF, local_rows, local_cols, i_local_d, j_local_d, a_local_d, &output_mat_local)); // we also have to go and build the a, i, j for the non-local off-diagonal block - if (mpi) + if (mpi) { - // Now we need to build garray on the host and rewrite the j_nonlocal_d indices so they are local - // The default values here are for the case where we - // let petsc do it, it resets this internally in MatSetUpMultiply_MPIAIJ + // Copy device garray to host PetscInt *garray_host = NULL; - PetscInt col_ao_output = 0; - // This fences internally - rewrite_j_global_to_local(cols_ao, col_ao_output, j_nonlocal_d, &garray_host); + PetscCallVoid(PetscMalloc1(col_ao_output, &garray_host)); + if (col_ao_output > 0) + { + PetscIntKokkosViewHost garray_h(garray_host, col_ao_output); + // Device to host so don't need to specify exec space + Kokkos::deep_copy(garray_h, garray_d); + size_t bytes = col_ao_output * sizeof(PetscInt); + PetscCallVoid(PetscLogGpuToCpu(bytes)); + } // We can create our nonlocal diagonal block matrix directly on the device PetscCallVoid(MatCreateSeqAIJKokkosWithKokkosViews(PETSC_COMM_SELF, local_rows, col_ao_output, i_nonlocal_d, j_nonlocal_d, a_nonlocal_d, &output_mat_nonlocal)); // We can now create our MPI matrix PetscCallVoid(MatCreateMPIAIJWithSeqAIJ(MPI_COMM_MATRIX, global_rows, global_cols, output_mat_local, output_mat_nonlocal, garray_host, output_mat)); - } - // If in serial + } + // If in serial else { *output_mat = output_mat_local; @@ -1790,27 +1802,73 @@ PETSC_INTERN void MatAXPY_kokkos(Mat *Y, PetscScalar alpha, Mat *X) // Ensure everything is finished before we hit the spadd below Kokkos::fence(); + // ~~~~~~~~~ + // Build merged garray from the union of X and Y global nonlocal column indices. + // Then remap both to local indices so spadd sees correct column numbering. + // ~~~~~~~~~ + + PetscInt nnz_x = xkok_nonlocal->csrmat.nnz(); + PetscInt nnz_y = ykok_nonlocal->csrmat.nnz(); + PetscInt total_nnz_xy = nnz_x + nnz_y; + + // Non-owning views over the raw j data (already holds global indices at this point) + PetscIntKokkosView j_x_view(device_nonlocal_x_j, nnz_x); + PetscIntKokkosView j_y_view(device_nonlocal_y_j, nnz_y); + + PetscIntKokkosView garray_d; + PetscInt col_ao_output = 0; + + if (total_nnz_xy > 0) + { + // Concatenate all global j indices into one array, sort, unique → merged garray + PetscIntKokkosView combined_j_d("combined_j_d", total_nnz_xy); + Kokkos::deep_copy(exec, Kokkos::subview(combined_j_d, Kokkos::make_pair((PetscInt)0, nnz_x)), j_x_view); + Kokkos::deep_copy(exec, Kokkos::subview(combined_j_d, Kokkos::make_pair(nnz_x, total_nnz_xy)), j_y_view); + Kokkos::sort(exec, combined_j_d); + Kokkos::fence(); + + PetscIntKokkosView garray_full_d("garray_full_d", total_nnz_xy); + auto unique_end_it = Kokkos::Experimental::unique_copy(exec, combined_j_d, garray_full_d); + Kokkos::fence(); + col_ao_output = static_cast(unique_end_it - Kokkos::Experimental::begin(garray_full_d)); + PetscInt zero = 0; + garray_d = Kokkos::subview(garray_full_d, Kokkos::make_pair(zero, col_ao_output)); + + // Remap j_y and j_x from global to local indices into the merged garray + // These fence internally + remap_j_to_local_device(j_y_view, garray_d, col_ao_output); + remap_j_to_local_device(j_x_view, garray_d, col_ao_output); + } + // ~~~~~~~~~ Kokkos::View a_nonlocal_d_copy; Kokkos::View i_nonlocal_d_copy, j_nonlocal_d_copy; PetscInt *garray_host = NULL; - PetscInt col_ao_output = 0; - // Scope so the zcsr_nonlocal is destroyed once we copy + // Scope so the zcsr_nonlocal is destroyed once we copy { - // Now we can add the non-local components together + // Create csrmat wrappers for X and Y with the correct merged numCols + KokkosCsrMatrix xcsrmat("x_nonlocal_remapped", local_rows, col_ao_output, + nnz_x, xkok_nonlocal->csrmat.values, + xkok_nonlocal->csrmat.graph.row_map, xkok_nonlocal->csrmat.graph.entries); + KokkosCsrMatrix ycsrmat("y_nonlocal_remapped", local_rows, col_ao_output, + nnz_y, ykok_nonlocal->csrmat.values, + ykok_nonlocal->csrmat.graph.row_map, ykok_nonlocal->csrmat.graph.entries); + + Kokkos::fence(); + // Now we can add the non-local components together. + // Local indices into the merged sorted garray preserve row-sort order. KokkosCsrMatrix zcsr_nonlocal; - // Global indices are sorted - KernelHandle kh_nonlocal; - kh_nonlocal.create_spadd_handle(true); + KernelHandle kh_nonlocal; + kh_nonlocal.create_spadd_handle(true); - KokkosSparse::spadd_symbolic(exec, &kh_nonlocal, xkok_nonlocal->csrmat, ykok_nonlocal->csrmat, zcsr_nonlocal); - KokkosSparse::spadd_numeric(exec, &kh_nonlocal, alpha, xkok_nonlocal->csrmat, (PetscScalar)1.0, ykok_nonlocal->csrmat, zcsr_nonlocal); + KokkosSparse::spadd_symbolic(exec, &kh_nonlocal, xcsrmat, ycsrmat, zcsr_nonlocal); + KokkosSparse::spadd_numeric(exec, &kh_nonlocal, alpha, xcsrmat, (PetscScalar)1.0, ycsrmat, zcsr_nonlocal); kh_nonlocal.destroy_spadd_handle(); - Kokkos::fence(); + Kokkos::fence(); // Can now destroy the copy PetscCallVoid(MatDestroy(&mat_nonlocal_x_copy)); @@ -1821,22 +1879,23 @@ PETSC_INTERN void MatAXPY_kokkos(Mat *Y, PetscScalar alpha, Mat *X) auto i_nonlocal_d_z = zcsr_nonlocal.graph.row_map; auto j_nonlocal_d_z = zcsr_nonlocal.graph.entries; - // We know the most nonlocal indices we can have are the addition of x and y - // (some might be the same) - PetscInt cols_ao = cols_ao_x + cols_ao_y; - - // ~~~~~~~~~ + // j_nonlocal_d_z already contains local indices; copy garray_d to host + PetscCallVoid(PetscMalloc1(col_ao_output, &garray_host)); + if (col_ao_output > 0) + { + PetscIntKokkosViewHost garray_h(garray_host, col_ao_output); + // Device to host so don't need to specify exec space + Kokkos::deep_copy(garray_h, garray_d); + size_t bytes = col_ao_output * sizeof(PetscInt); + PetscCallVoid(PetscLogGpuToCpu(bytes)); + } // Let's make sure everything on the device is finished - Kokkos::fence(); - - // Now we need to build garray on the host and rewrite the j_nonlocal_d_z indices so they are local - // This fences internally - rewrite_j_global_to_local(cols_ao, col_ao_output, j_nonlocal_d_z, &garray_host); + Kokkos::fence(); a_nonlocal_d_copy = Kokkos::View("a_local_d_copy", a_nonlocal_d_z.extent(0)); i_nonlocal_d_copy = Kokkos::View("i_local_d_copy", i_nonlocal_d_z.extent(0)); - j_nonlocal_d_copy = Kokkos::View("j_local_d_copy", j_nonlocal_d_z.extent(0)); + j_nonlocal_d_copy = Kokkos::View("j_local_d_copy", j_nonlocal_d_z.extent(0)); Kokkos::deep_copy(exec, a_nonlocal_d_copy, a_nonlocal_d_z); Kokkos::deep_copy(exec, i_nonlocal_d_copy, i_nonlocal_d_z); From 93bc84cc3c8acfa6085be882aaaf3bd6aaf9980f Mon Sep 17 00:00:00 2001 From: sdargavi Date: Thu, 2 Apr 2026 20:05:29 +0100 Subject: [PATCH 09/60] Extra fence before spadd --- src/PETSc_Helperk.kokkos.cxx | 1 + 1 file changed, 1 insertion(+) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 8c6fe9ed..942e4310 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -1726,6 +1726,7 @@ PETSC_INTERN void MatAXPY_kokkos(Mat *Y, PetscScalar alpha, Mat *X) // ~~~~~~~~~~~~~~~ // Let's go and add the local components together // ~~~~~~~~~~~~~~~ + Kokkos::fence(); Mat_SeqAIJKokkos *xkok_local, *ykok_local; ykok_local = static_cast(mat_local_y->spptr); From 91d436eaccbfbb8b0cdbc56c5598ea7e265d57ab Mon Sep 17 00:00:00 2001 From: sdargavi Date: Fri, 3 Apr 2026 16:51:59 +0100 Subject: [PATCH 10/60] Only use lvec as leaf vector in scatters --- src/PMISR_Modulek.kokkos.cxx | 62 +++++++++++++++++------------------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx index fed521d6..00c1664a 100644 --- a/src/PMISR_Modulek.kokkos.cxx +++ b/src/PMISR_Modulek.kokkos.cxx @@ -156,10 +156,9 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // ~~~~~~~~~~~~ // Create reusable Vecs for VecScatter inside the loop (cf_markers int → PetscScalar) - Vec scatter_root_vec = NULL, scatter_leaf_vec = NULL; + Vec scatter_root_vec = NULL; if (mpi) { PetscCallVoid(MatCreateVecs(*strength_mat, &scatter_root_vec, NULL)); - PetscCallVoid(VecDuplicate(mat_mpi->lvec, &scatter_leaf_vec)); } // Let's keep track of how many times we go through the loops @@ -195,9 +194,9 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co } // Ensure the root buffer is no longer being written before Begin. Kokkos::fence(); - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); // Complete the in-flight forward scatter before reading the receive buffer. - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); } @@ -267,12 +266,12 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // Convert PetscScalar → int after End, when the receive buffer is complete. { ConstPetscScalarKokkosView leaf_scalar_d; - PetscCallVoid(VecGetKokkosView(scatter_leaf_vec, &leaf_scalar_d)); + PetscCallVoid(VecGetKokkosView(mat_mpi->lvec, &leaf_scalar_d)); Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(PetscInt i) { cf_markers_nonlocal_d(i) = (int)leaf_scalar_d(i); }); - PetscCallVoid(VecRestoreKokkosView(scatter_leaf_vec, &leaf_scalar_d)); + PetscCallVoid(VecRestoreKokkosView(mat_mpi->lvec, &leaf_scalar_d)); } Kokkos::parallel_for( @@ -362,12 +361,12 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // Convert int → PetscScalar for the leaf (nonlocal) data { PetscScalarKokkosView leaf_scalar_d; - PetscCallVoid(VecGetKokkosViewWrite(scatter_leaf_vec, &leaf_scalar_d)); + PetscCallVoid(VecGetKokkosViewWrite(mat_mpi->lvec, &leaf_scalar_d)); Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(PetscInt i) { leaf_scalar_d(i) = (PetscScalar)cf_markers_nonlocal_d(i); }); - PetscCallVoid(VecRestoreKokkosViewWrite(scatter_leaf_vec, &leaf_scalar_d)); + PetscCallVoid(VecRestoreKokkosViewWrite(mat_mpi->lvec, &leaf_scalar_d)); } // Convert int → PetscScalar for the root (local) data { @@ -381,12 +380,12 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co } // Ensure send/receive buffers are stable before Begin. Kokkos::fence(); - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_leaf_vec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, mat_mpi->lvec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); // Complete reverse scatter before reading reduced root buffer. - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_leaf_vec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, mat_mpi->lvec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); // While reverse scatter is in-flight, do local-only updates in cf_markers_temp_d. - // This must not touch scatter_root_vec/scatter_leaf_vec. + // This must not touch scatter_root_vec/mat_mpi->lvec. Kokkos::parallel_for( Kokkos::TeamPolicy<>(exec, local_rows, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { @@ -484,7 +483,6 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // Cleanup loop Vecs PetscCallVoid(VecDestroy(&scatter_root_vec)); - PetscCallVoid(VecDestroy(&scatter_leaf_vec)); // ~~~~~~~~~ // Now assign our final cf markers @@ -716,10 +714,9 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength // ~~~~~~~~~~~~ // Create reusable Vecs for VecScatter inside the loop - Vec scatter_root_vec = NULL, scatter_leaf_vec = NULL; + Vec scatter_root_vec = NULL; if (mpi) { PetscCallVoid(MatCreateVecs(*strength_mat, &scatter_root_vec, NULL)); - PetscCallVoid(VecDuplicate(mat_mpi->lvec, &scatter_leaf_vec)); } // Let's keep track of how many times we go through the loops @@ -754,17 +751,17 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength } // Ensure send/receive buffers are stable before Begin. Kokkos::fence(); - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); // Convert PetscScalar → int { ConstPetscScalarKokkosView leaf_scalar_d; - PetscCallVoid(VecGetKokkosView(scatter_leaf_vec, &leaf_scalar_d)); + PetscCallVoid(VecGetKokkosView(mat_mpi->lvec, &leaf_scalar_d)); Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(PetscInt i) { cf_markers_nonlocal_d(i) = (int)leaf_scalar_d(i); }); - PetscCallVoid(VecRestoreKokkosView(scatter_leaf_vec, &leaf_scalar_d)); + PetscCallVoid(VecRestoreKokkosView(mat_mpi->lvec, &leaf_scalar_d)); } } @@ -886,12 +883,12 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength // (LOR is equivalent to sum when values are 0/1 bools) { PetscScalarKokkosView leaf_scalar_d; - PetscCallVoid(VecGetKokkosViewWrite(scatter_leaf_vec, &leaf_scalar_d)); + PetscCallVoid(VecGetKokkosViewWrite(mat_mpi->lvec, &leaf_scalar_d)); Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(PetscInt i) { leaf_scalar_d(i) = veto_nonlocal_d(i) ? 1.0 : 0.0; }); - PetscCallVoid(VecRestoreKokkosViewWrite(scatter_leaf_vec, &leaf_scalar_d)); + PetscCallVoid(VecRestoreKokkosViewWrite(mat_mpi->lvec, &leaf_scalar_d)); } { PetscScalarKokkosView root_scalar_d; @@ -903,9 +900,9 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength PetscCallVoid(VecRestoreKokkosViewWrite(scatter_root_vec, &root_scalar_d)); } // Ensure send/receive buffers are stable before Begin. - Kokkos::fence(); - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_leaf_vec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_leaf_vec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); + Kokkos::fence(); + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, mat_mpi->lvec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, mat_mpi->lvec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); { ConstPetscScalarKokkosView root_scalar_d; PetscCallVoid(VecGetKokkosView(scatter_root_vec, &root_scalar_d)); @@ -1022,16 +1019,16 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength } // Ensure send/receive buffers are stable before Begin. Kokkos::fence(); - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); { ConstPetscScalarKokkosView leaf_scalar_d; - PetscCallVoid(VecGetKokkosView(scatter_leaf_vec, &leaf_scalar_d)); + PetscCallVoid(VecGetKokkosView(mat_mpi->lvec, &leaf_scalar_d)); Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(PetscInt i) { cf_markers_nonlocal_d(i) = (int)leaf_scalar_d(i); }); - PetscCallVoid(VecRestoreKokkosView(scatter_leaf_vec, &leaf_scalar_d)); + PetscCallVoid(VecRestoreKokkosView(mat_mpi->lvec, &leaf_scalar_d)); } // We use the veto arrays here to do this comms @@ -1066,12 +1063,12 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength // Any local node with veto set to true is not in the set { PetscScalarKokkosView leaf_scalar_d; - PetscCallVoid(VecGetKokkosViewWrite(scatter_leaf_vec, &leaf_scalar_d)); + PetscCallVoid(VecGetKokkosViewWrite(mat_mpi->lvec, &leaf_scalar_d)); Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(PetscInt i) { leaf_scalar_d(i) = veto_nonlocal_d(i) ? 1.0 : 0.0; }); - PetscCallVoid(VecRestoreKokkosViewWrite(scatter_leaf_vec, &leaf_scalar_d)); + PetscCallVoid(VecRestoreKokkosViewWrite(mat_mpi->lvec, &leaf_scalar_d)); } { PetscScalarKokkosView root_scalar_d; @@ -1083,9 +1080,9 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength PetscCallVoid(VecRestoreKokkosViewWrite(scatter_root_vec, &root_scalar_d)); } // Ensure send/receive buffers are stable before Begin. - Kokkos::fence(); - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_leaf_vec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_leaf_vec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); + Kokkos::fence(); + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, mat_mpi->lvec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, mat_mpi->lvec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); { ConstPetscScalarKokkosView root_scalar_d; PetscCallVoid(VecGetKokkosView(scatter_root_vec, &root_scalar_d)); @@ -1194,7 +1191,6 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength // Cleanup loop Vecs PetscCallVoid(VecDestroy(&scatter_root_vec)); - PetscCallVoid(VecDestroy(&scatter_leaf_vec)); // Cleanup the local transposes if (destroy_spst) PetscCallVoid(MatDestroy(&mat_local_spst)); From 3ead62791a328d4d402506832c61290ed8a46b96 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Fri, 3 Apr 2026 18:46:03 +0100 Subject: [PATCH 11/60] Sync every matrix before use in Kokkos --- include/kokkos_helper.hpp | 1 + src/DDC_Modulek.kokkos.cxx | 2 + src/Gmres_Polyk.kokkos.cxx | 2 + src/Grid_Transferk.kokkos.cxx | 6 +++ src/MatDiagDomk.kokkos.cxx | 2 + src/PETSc_Helperk.kokkos.cxx | 81 +++++++++++++++++++++++------------ src/SAI_Zk.kokkos.cxx | 3 ++ 7 files changed, 70 insertions(+), 27 deletions(-) diff --git a/include/kokkos_helper.hpp b/include/kokkos_helper.hpp index 7e120072..a9cddb2a 100644 --- a/include/kokkos_helper.hpp +++ b/include/kokkos_helper.hpp @@ -34,6 +34,7 @@ using Scratch2DScalarView = Kokkos::View; PETSC_INTERN void mat_duplicate_copy_plus_diag_kokkos(Mat *, int, Mat *); +PETSC_INTERN void mat_sync(Mat *); PETSC_INTERN void rewrite_j_global_to_local(PetscInt, PetscInt&, PetscIntKokkosView, PetscInt**); PETSC_INTERN void create_cf_is_device_kokkos(Mat *input_mat, const int match_cf, PetscIntKokkosView &is_local_d); PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, const int max_luby_steps, const int pmis_int, PetscScalarKokkosView &measure_local_d, intKokkosView &cf_markers_d, const int zero_measure_c_point_int); diff --git a/src/DDC_Modulek.kokkos.cxx b/src/DDC_Modulek.kokkos.cxx index 87d0310d..1ca839d9 100644 --- a/src/DDC_Modulek.kokkos.cxx +++ b/src/DDC_Modulek.kokkos.cxx @@ -20,6 +20,8 @@ PETSC_INTERN void ddc_kokkos(Mat *input_mat, const PetscReal fraction_swap, cons intKokkosView cf_markers_d = cf_markers_local_d; PetscScalarKokkosView diag_dom_ratio_d = diag_dom_ratio_local_d; PetscIntKokkosView is_fine_local_d; + // Equivalent to calling MatSeqAIJKokkosSyncDevice which is petsc intern + mat_sync(input_mat); const int match_cf = -1; // F_POINT == -1 create_cf_is_device_kokkos(input_mat, match_cf, is_fine_local_d); diff --git a/src/Gmres_Polyk.kokkos.cxx b/src/Gmres_Polyk.kokkos.cxx index ea0106f3..15ec8e11 100644 --- a/src/Gmres_Polyk.kokkos.cxx +++ b/src/Gmres_Polyk.kokkos.cxx @@ -18,6 +18,8 @@ PETSC_INTERN void mat_mult_powers_share_sparsity_kokkos(Mat *input_mat, const in PetscInt one = 1; bool deallocate_submatrices = false; + mat_sync(input_mat); + PetscCallVoid(MatGetType(*input_mat, &mat_type)); // Are we in parallel? const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; diff --git a/src/Grid_Transferk.kokkos.cxx b/src/Grid_Transferk.kokkos.cxx index d8f92da0..8b63ab16 100644 --- a/src/Grid_Transferk.kokkos.cxx +++ b/src/Grid_Transferk.kokkos.cxx @@ -16,6 +16,8 @@ PETSC_INTERN void generate_one_point_with_one_entry_from_sparse_kokkos(Mat *inpu PetscInt nnzs_match_local, nnzs_match_nonlocal; Mat output_mat_local, output_mat_nonlocal; + mat_sync(input_mat); + PetscCallVoid(MatGetType(*input_mat, &mat_type)); // Are we in parallel? const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; @@ -316,6 +318,8 @@ PETSC_INTERN void compute_P_from_W_kokkos(Mat *input_mat, PetscInt global_row_st PetscInt nnzs_match_local, nnzs_match_nonlocal; Mat output_mat_local, output_mat_nonlocal; + mat_sync(input_mat); + PetscCallVoid(MatGetType(*input_mat, &mat_type)); // Are we in parallel? const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; @@ -742,6 +746,8 @@ PETSC_INTERN void compute_R_from_Z_kokkos(Mat *input_mat, PetscInt global_row_st PetscInt nnzs_match_local, nnzs_match_nonlocal; Mat output_mat_local, output_mat_nonlocal; + mat_sync(input_mat); + PetscCallVoid(MatGetType(*input_mat, &mat_type)); // Are we in parallel? const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; diff --git a/src/MatDiagDomk.kokkos.cxx b/src/MatDiagDomk.kokkos.cxx index ced90f5a..fd95696f 100644 --- a/src/MatDiagDomk.kokkos.cxx +++ b/src/MatDiagDomk.kokkos.cxx @@ -16,6 +16,8 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio { PetscInt local_rows, local_cols; + mat_sync(input_mat); + // Are we in parallel? MatType mat_type; MPI_Comm MPI_COMM_MATRIX; diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 942e4310..ef9cffe6 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -25,6 +25,44 @@ static PetscErrorCode check_exact_petscint_to_scalar_encoding(PetscInt max_encod //------------------------------------------------------------------------------------------------------------------------ +// Sync the kokkos parts of the matrix to make sure they're up to date +PETSC_INTERN void mat_sync(Mat *X) +{ + MatType mat_type; + PetscCallVoid(MatGetType(*X, &mat_type)); + // Are we in parallel? + const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; + Mat mat_local_x = NULL, mat_nonlocal_x = NULL; + + const PetscInt *colmap_x; + if (mpi) + { + PetscCallVoid(MatMPIAIJGetSeqAIJ(*X, &mat_local_x, &mat_nonlocal_x, &colmap_x)); + } + else + { + mat_local_x = *X; + } + + Mat_SeqAIJKokkos *mat_local_xkok = static_cast(mat_local_x->spptr); + if (mat_local_xkok->a_dual.need_sync_device()) { + mat_local_xkok->a_dual.sync_device(); + mat_local_xkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ + mat_local_xkok->hermitian_updated = PETSC_FALSE; + } + if (mpi) + { + Mat_SeqAIJKokkos *mat_nonlocal_xkok = static_cast(mat_nonlocal_x->spptr); + if (mat_nonlocal_xkok->a_dual.need_sync_device()) { + mat_nonlocal_xkok->a_dual.sync_device(); + mat_nonlocal_xkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ + mat_nonlocal_xkok->hermitian_updated = PETSC_FALSE; + } + } +} + +//------------------------------------------------------------------------------------------------------------------------ + // Remap each entry in j_d from a global index to its local index via binary search into garray_d. // garray_d must be a sorted array of unique global indices. // Fences internally. @@ -119,6 +157,9 @@ PETSC_INTERN void remove_small_from_sparse_kokkos(Mat *input_mat, const PetscRea PetscInt nnzs_match_local, nnzs_match_nonlocal; Mat output_mat_local, output_mat_nonlocal; + // Equivalent to calling MatSeqAIJKokkosSyncDevice which is petsc intern + mat_sync(input_mat); + PetscCallVoid(MatGetType(*input_mat, &mat_type)); // Are we in parallel? const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; @@ -841,6 +882,9 @@ PETSC_INTERN void remove_from_sparse_match_kokkos(Mat *input_mat, Mat *output_ma PetscInt rows_ao_input, cols_ao_input, rows_ao_output, cols_ao_output; MatType mat_type; + // Equivalent to calling MatSeqAIJKokkosSyncDevice which is petsc intern + mat_sync(input_mat); + PetscCallVoid(MatGetType(*input_mat, &mat_type)); // Are we in parallel? const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; @@ -1262,6 +1306,9 @@ PETSC_INTERN void mat_duplicate_copy_plus_diag_kokkos(Mat *input_mat, const int PetscInt nnzs_match_local, nnzs_match_nonlocal; Mat output_mat_local, output_mat_nonlocal; + // Equivalent to calling MatSeqAIJKokkosSyncDevice which is petsc intern + mat_sync(input_mat); + PetscCallVoid(MatGetType(*input_mat, &mat_type)); // Are we in parallel? const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; @@ -1667,33 +1714,10 @@ PETSC_INTERN void MatAXPY_kokkos(Mat *Y, PetscScalar alpha, Mat *X) PetscCallVoid(MatMPIAIJGetSeqAIJ(*Y, &mat_local_y, &mat_nonlocal_y, &colmap_y)); PetscCallVoid(MatMPIAIJGetSeqAIJ(*X, &mat_local_x, &mat_nonlocal_x, &colmap_x)); - Mat_SeqAIJKokkos *mat_local_ykok = static_cast(mat_local_y->spptr); - Mat_SeqAIJKokkos *mat_nonlocal_ykok = static_cast(mat_nonlocal_y->spptr); - Mat_SeqAIJKokkos *mat_local_xkok = static_cast(mat_local_x->spptr); - Mat_SeqAIJKokkos *mat_nonlocal_xkok = static_cast(mat_nonlocal_x->spptr); - // Equivalent to calling MatSeqAIJKokkosSyncDevice which is petsc intern - // We have to make sure the device data is up to date before we do the axpy - if (mat_local_ykok->a_dual.need_sync_device()) { - mat_local_ykok->a_dual.sync_device(); - mat_local_ykok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ - mat_local_ykok->hermitian_updated = PETSC_FALSE; - } - if (mat_nonlocal_ykok->a_dual.need_sync_device()) { - mat_nonlocal_ykok->a_dual.sync_device(); - mat_nonlocal_ykok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ - mat_nonlocal_ykok->hermitian_updated = PETSC_FALSE; - } - if (mat_local_xkok->a_dual.need_sync_device()) { - mat_local_xkok->a_dual.sync_device(); - mat_local_xkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ - mat_local_xkok->hermitian_updated = PETSC_FALSE; - } - if (mat_nonlocal_xkok->a_dual.need_sync_device()) { - mat_nonlocal_xkok->a_dual.sync_device(); - mat_nonlocal_xkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ - mat_nonlocal_xkok->hermitian_updated = PETSC_FALSE; - } + // We have to make sure the device data is up to date before we do the axpy + mat_sync(X); + mat_sync(Y); PetscInt rows_ao_y, cols_ao_y, rows_ao_x, cols_ao_x; auto exec = PetscGetKokkosExecutionSpace(); @@ -2530,7 +2554,10 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos(Mat *input_mat, IS *is_row, IS *is_c PetscCallVoid(MatGetOwnershipRangeColumn(*input_mat, &global_col_start, &global_col_end_plus_one)); PetscInt global_rows_row, global_cols_col; PetscCallVoid(ISGetSize(*is_row, &global_rows_row)); - PetscCallVoid(ISGetSize(*is_col, &global_cols_col)); + PetscCallVoid(ISGetSize(*is_col, &global_cols_col)); + + // Equivalent to calling MatSeqAIJKokkosSyncDevice which is petsc intern + mat_sync(input_mat); PetscIntKokkosView is_row_d_d, is_col_d_d; const int level_idx = our_level - 1; diff --git a/src/SAI_Zk.kokkos.cxx b/src/SAI_Zk.kokkos.cxx index 9f479b16..409332d2 100644 --- a/src/SAI_Zk.kokkos.cxx +++ b/src/SAI_Zk.kokkos.cxx @@ -24,6 +24,9 @@ PETSC_INTERN void calculate_and_build_sai_z_kokkos(Mat *A_ff, Mat *A_cf, Mat *sp PetscInt one = 1; bool deallocate_submatrices = false; + mat_sync(A_ff); + mat_sync(A_cf); + PetscCallVoid(MatGetType(*A_ff, &mat_type)); // Are we in parallel? const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; From 300bebbb0160a339cbd1d2192905ca443b49a876 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Fri, 3 Apr 2026 19:42:39 +0100 Subject: [PATCH 12/60] Add tracers --- include/kokkos_helper.hpp | 17 +++++++++++++++++ src/DDC_Modulek.kokkos.cxx | 3 ++- src/Device_Datak.kokkos.cxx | 6 ++++++ src/Gmres_Polyk.kokkos.cxx | 1 + src/Grid_Transferk.kokkos.cxx | 3 +++ src/MatDiagDomk.kokkos.cxx | 1 + src/PETSc_Helperk.kokkos.cxx | 13 +++++++++++++ src/PMISR_Modulek.kokkos.cxx | 3 +++ src/SAI_Zk.kokkos.cxx | 1 + src/VecISCopyLocalk.kokkos.cxx | 4 ++++ 10 files changed, 51 insertions(+), 1 deletion(-) diff --git a/include/kokkos_helper.hpp b/include/kokkos_helper.hpp index a9cddb2a..3975a8d3 100644 --- a/include/kokkos_helper.hpp +++ b/include/kokkos_helper.hpp @@ -14,6 +14,23 @@ #include #include #include +#include + +struct PflareKokkosTrace { + const char *name; + PflareKokkosTrace(const char *n) : name(n) { + int rank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + printf("[PFLARE kokkos rank=%d] Entering %s\n", rank, name); + fflush(stdout); + } + ~PflareKokkosTrace() { + int rank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + printf("[PFLARE kokkos rank=%d] Leaving %s\n", rank, name); + fflush(stdout); + } +}; using DefaultExecutionSpace = Kokkos::DefaultExecutionSpace; using DefaultMemorySpace = Kokkos::DefaultExecutionSpace::memory_space; diff --git a/src/DDC_Modulek.kokkos.cxx b/src/DDC_Modulek.kokkos.cxx index 1ca839d9..d0495156 100644 --- a/src/DDC_Modulek.kokkos.cxx +++ b/src/DDC_Modulek.kokkos.cxx @@ -15,7 +15,8 @@ // You have to explicitly call copy_cf_markers_d2h(cf_markers_local) to do this PETSC_INTERN void ddc_kokkos(Mat *input_mat, const PetscReal fraction_swap, const PetscReal max_dd_ratio, const PetscReal max_dd_ratio_achieved, Mat *aff, PetscReal *random_numbers) { - // Can't use the global directly within the parallel + PflareKokkosTrace _trace("ddc_kokkos"); + // Can't use the global directly within the parallel // regions on the device intKokkosView cf_markers_d = cf_markers_local_d; PetscScalarKokkosView diag_dom_ratio_d = diag_dom_ratio_local_d; diff --git a/src/Device_Datak.kokkos.cxx b/src/Device_Datak.kokkos.cxx index be3b3ca6..8bbd9c4f 100644 --- a/src/Device_Datak.kokkos.cxx +++ b/src/Device_Datak.kokkos.cxx @@ -15,6 +15,7 @@ PetscScalarKokkosView diag_dom_ratio_local_d; // Copy the global cf_markers_local_d back to the host PETSC_INTERN void copy_cf_markers_d2h(int *cf_markers_local) { + PflareKokkosTrace _trace("copy_cf_markers_d2h"); // Host wrapper for cf_markers_local intKokkosViewHost cf_markers_local_h(cf_markers_local, cf_markers_local_d.extent(0)); @@ -33,6 +34,7 @@ PETSC_INTERN void copy_cf_markers_d2h(int *cf_markers_local) // Copy the global diag_dom_ratio_local_d back to the host PETSC_INTERN void copy_diag_dom_ratio_d2h(PetscReal *diag_dom_ratio_local) { + PflareKokkosTrace _trace("copy_diag_dom_ratio_d2h"); // Host wrapper for diag_dom_ratio_local PetscScalarKokkosViewHost diag_dom_ratio_h(diag_dom_ratio_local, diag_dom_ratio_local_d.extent(0)); @@ -51,6 +53,7 @@ PETSC_INTERN void copy_diag_dom_ratio_d2h(PetscReal *diag_dom_ratio_local) // Delete the global cf_markers_local_d PETSC_INTERN void delete_device_cf_markers() { + PflareKokkosTrace _trace("delete_device_cf_markers"); // Delete the device view - this assigns an empty view // and hence the old view has its ref counter decremented cf_markers_local_d = intKokkosView(); @@ -63,6 +66,7 @@ PETSC_INTERN void delete_device_cf_markers() // Delete the global diag_dom_ratio_local_d PETSC_INTERN void delete_device_diag_dom_ratio() { + PflareKokkosTrace _trace("delete_device_diag_dom_ratio"); // Delete the device view - this assigns an empty view // and hence the old view has its ref counter decremented diag_dom_ratio_local_d = PetscScalarKokkosView(); @@ -75,6 +79,7 @@ PETSC_INTERN void delete_device_diag_dom_ratio() // Creates the device local indices for F or C points based on the global cf_markers_local_d PETSC_INTERN void create_cf_is_device_kokkos(Mat *input_mat, const int match_cf, PetscIntKokkosView &is_local_d) { + PflareKokkosTrace _trace("create_cf_is_device_kokkos"); PetscInt local_rows, local_cols; PetscCallVoid(MatGetLocalSize(*input_mat, &local_rows, &local_cols)); auto exec = PetscGetKokkosExecutionSpace(); @@ -134,6 +139,7 @@ PETSC_INTERN void create_cf_is_device_kokkos(Mat *input_mat, const int match_cf, // Creates the host IS is_fine and is_coarse based on the global cf_markers_local_d PETSC_INTERN void create_cf_is_kokkos(Mat *input_mat, IS *is_fine, IS *is_coarse) { + PflareKokkosTrace _trace("create_cf_is_kokkos"); PetscIntKokkosView is_fine_local_d, is_coarse_local_d; MPI_Comm MPI_COMM_MATRIX; PetscCallVoid(PetscObjectGetComm((PetscObject)*input_mat, &MPI_COMM_MATRIX)); diff --git a/src/Gmres_Polyk.kokkos.cxx b/src/Gmres_Polyk.kokkos.cxx index 15ec8e11..ef133a48 100644 --- a/src/Gmres_Polyk.kokkos.cxx +++ b/src/Gmres_Polyk.kokkos.cxx @@ -8,6 +8,7 @@ PETSC_INTERN void mat_mult_powers_share_sparsity_kokkos(Mat *input_mat, const int poly_order, const int poly_sparsity_order, PetscReal *coefficients, \ const int reuse_int_reuse_mat, Mat *reuse_mat, const int reuse_int_cmat, Mat *output_mat) { + PflareKokkosTrace _trace("mat_mult_powers_share_sparsity_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows, local_cols; PetscInt global_row_start_temp, global_row_end_plus_one_temp; diff --git a/src/Grid_Transferk.kokkos.cxx b/src/Grid_Transferk.kokkos.cxx index 8b63ab16..1fa6ac96 100644 --- a/src/Grid_Transferk.kokkos.cxx +++ b/src/Grid_Transferk.kokkos.cxx @@ -7,6 +7,7 @@ // Generate one point classical prolongator but with kokkos - keeping everything on the device PETSC_INTERN void generate_one_point_with_one_entry_from_sparse_kokkos(Mat *input_mat, Mat *output_mat) { + PflareKokkosTrace _trace("generate_one_point_with_one_entry_from_sparse_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows, local_cols, global_rows, global_cols; PetscInt global_row_start, global_row_end_plus_one; @@ -308,6 +309,7 @@ PETSC_INTERN void generate_one_point_with_one_entry_from_sparse_kokkos(Mat *inpu PETSC_INTERN void compute_P_from_W_kokkos(Mat *input_mat, PetscInt global_row_start, IS *is_fine, \ IS *is_coarse, int identity_int, int reuse_int, Mat *output_mat) { + PflareKokkosTrace _trace("compute_P_from_W_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt global_row_start_W, global_row_end_plus_one_W; PetscInt global_col_start_W, global_col_end_plus_one_W; @@ -734,6 +736,7 @@ PETSC_INTERN void compute_R_from_Z_kokkos(Mat *input_mat, PetscInt global_row_st IS *is_coarse, IS *orig_fine_col_indices, int identity_int, int reuse_int, int reuse_indices_int, \ Mat *output_mat) { + PflareKokkosTrace _trace("compute_R_from_Z_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt global_row_start_Z, global_row_end_plus_one_Z; PetscInt global_col_start_Z, global_col_end_plus_one_Z; diff --git a/src/MatDiagDomk.kokkos.cxx b/src/MatDiagDomk.kokkos.cxx index fd95696f..8336b40d 100644 --- a/src/MatDiagDomk.kokkos.cxx +++ b/src/MatDiagDomk.kokkos.cxx @@ -14,6 +14,7 @@ // This code is very similar to MatCreateSubMatrix_kokkos PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio_achieved, PetscInt *local_rows_aff) { + PflareKokkosTrace _trace("MatDiagDomRatio_kokkos"); PetscInt local_rows, local_cols; mat_sync(input_mat); diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index ef9cffe6..d3a3f54c 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -11,6 +11,7 @@ static PetscErrorCode check_exact_petscint_to_scalar_encoding(PetscInt max_encoded_value, MPI_Comm comm) { PetscFunctionBegin; + PflareKokkosTrace _trace("check_exact_petscint_to_scalar_encoding"); if (max_encoded_value <= 0) PetscFunctionReturn(PETSC_SUCCESS); const int digits = std::numeric_limits::digits; @@ -28,6 +29,7 @@ static PetscErrorCode check_exact_petscint_to_scalar_encoding(PetscInt max_encod // Sync the kokkos parts of the matrix to make sure they're up to date PETSC_INTERN void mat_sync(Mat *X) { + PflareKokkosTrace _trace("mat_sync"); MatType mat_type; PetscCallVoid(MatGetType(*X, &mat_type)); // Are we in parallel? @@ -68,6 +70,7 @@ PETSC_INTERN void mat_sync(Mat *X) // Fences internally. static void remap_j_to_local_device(PetscIntKokkosView j_d, PetscIntKokkosView garray_d, PetscInt col_ao_output) { + PflareKokkosTrace _trace("remap_j_to_local_device"); auto exec = PetscGetKokkosExecutionSpace(); if (j_d.extent(0) == 0) return; @@ -84,6 +87,7 @@ static void remap_j_to_local_device(PetscIntKokkosView j_d, PetscIntKokkosView g // garray_d (out) is a device view of the sorted unique global column indices (size col_ao_output). static void rewrite_j_global_to_local_device(PetscInt colmap_max_size, PetscInt &col_ao_output, PetscIntKokkosView j_nonlocal_d, PetscIntKokkosView &garray_d) { + PflareKokkosTrace _trace("rewrite_j_global_to_local_device"); auto exec = PetscGetKokkosExecutionSpace(); // Need to preallocate to the max size @@ -122,6 +126,7 @@ static void rewrite_j_global_to_local_device(PetscInt colmap_max_size, PetscInt // Generate the colmap and rewrite input global j indices to local given the calculated colmap PETSC_INTERN void rewrite_j_global_to_local(PetscInt colmap_max_size, PetscInt &col_ao_output, PetscIntKokkosView j_nonlocal_d, PetscInt **garray_host) { + PflareKokkosTrace _trace("rewrite_j_global_to_local"); auto exec = PetscGetKokkosExecutionSpace(); PetscIntKokkosView garray_d; @@ -148,6 +153,7 @@ PETSC_INTERN void remove_small_from_sparse_kokkos(Mat *input_mat, const PetscRea const int relative_max_row_tolerance_int, const int lump_int, \ const int allow_drop_diagonal_int, const int allow_diag_strength_int) { + PflareKokkosTrace _trace("remove_small_from_sparse_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows, local_cols, global_rows, global_cols; PetscInt global_row_start_temp, global_row_end_plus_one_temp; @@ -875,6 +881,7 @@ PETSC_INTERN void remove_small_from_sparse_kokkos(Mat *input_mat, const PetscRea // Drop according to a existing sparsity in output_mat but with kokkos - keeping everything on the device PETSC_INTERN void remove_from_sparse_match_kokkos(Mat *input_mat, Mat *output_mat, const int lump_int, const int alpha_int, const PetscReal alpha) { + PflareKokkosTrace _trace("remove_from_sparse_match_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows, local_cols, global_rows, global_cols; PetscInt global_row_start_temp, global_row_end_plus_one_temp; @@ -1222,6 +1229,7 @@ PETSC_INTERN void remove_from_sparse_match_kokkos(Mat *input_mat, Mat *output_ma // Set all the values of the matrix to val PETSC_INTERN void MatSetAllValues_kokkos(Mat *input_mat, PetscReal val) { + PflareKokkosTrace _trace("MatSetAllValues_kokkos"); MatType mat_type; PetscCallVoid(MatGetType(*input_mat, &mat_type)); @@ -1296,6 +1304,7 @@ PETSC_INTERN void MatSetAllValues_kokkos(Mat *input_mat, PetscReal val) // Duplicate and copy a matrix ensuring it always has a diagonal but with kokkos - keeping everything on the device PETSC_INTERN void mat_duplicate_copy_plus_diag_kokkos(Mat *input_mat, const int reuse_int, Mat *output_mat) { + PflareKokkosTrace _trace("mat_duplicate_copy_plus_diag_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt global_row_start_temp, global_row_end_plus_one_temp; PetscInt global_col_start_temp, global_col_end_plus_one_temp; @@ -1707,6 +1716,7 @@ PETSC_INTERN void mat_duplicate_copy_plus_diag_kokkos(Mat *input_mat, const int // Does a MatAXPY for a MPIAIJ Kokkos matrix - the petsc version currently uses the host making it very slow PETSC_INTERN void MatAXPY_kokkos(Mat *Y, PetscScalar alpha, Mat *X) { + PflareKokkosTrace _trace("MatAXPY_kokkos"); Mat mat_local_y = NULL, mat_nonlocal_y = NULL; Mat mat_local_x = NULL, mat_nonlocal_x = NULL; @@ -1950,6 +1960,7 @@ PETSC_INTERN void MatAXPY_kokkos(Mat *Y, PetscScalar alpha, Mat *X) // is_col must be sorted PETSC_INTERN void MatCreateSubMatrix_Seq_kokkos(Mat *input_mat, PetscIntKokkosView &is_row_d_d, PetscIntKokkosView &is_col_d_d, const int reuse_int, Mat *output_mat) { + PflareKokkosTrace _trace("MatCreateSubMatrix_Seq_kokkos"); PetscInt local_rows, local_cols; PetscInt nnzs_match_local; auto exec = PetscGetKokkosExecutionSpace(); @@ -2275,6 +2286,7 @@ PETSC_INTERN void MatCreateSubMatrix_Seq_kokkos(Mat *input_mat, PetscIntKokkosVi PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosView &is_row_d_d, PetscInt global_rows_row, \ PetscIntKokkosView &is_col_d_d, PetscInt global_cols_col, const int reuse_int, Mat *output_mat) { + PflareKokkosTrace _trace("MatCreateSubMatrix_kokkos_view"); PetscInt local_rows, local_cols; PetscInt global_rows, global_cols; PetscInt global_row_start, global_row_end_plus_one; @@ -2548,6 +2560,7 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos(Mat *input_mat, IS *is_row, IS *is_c const int reuse_int, Mat *output_mat, \ const int our_level, const int is_row_fine_int, const int is_col_fine_int) { + PflareKokkosTrace _trace("MatCreateSubMatrix_kokkos"); PetscInt global_row_start, global_row_end_plus_one; PetscInt global_col_start, global_col_end_plus_one; PetscCallVoid(MatGetOwnershipRange(*input_mat, &global_row_start, &global_row_end_plus_one)); diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx index 00c1664a..b22b6cc1 100644 --- a/src/PMISR_Modulek.kokkos.cxx +++ b/src/PMISR_Modulek.kokkos.cxx @@ -15,6 +15,7 @@ // This mirrors the CPU version pmisr_existing_measure_cf_markers in PMISR_Module.F90 PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, const int max_luby_steps, const int pmis_int, PetscScalarKokkosView &measure_local_d, intKokkosView &cf_markers_d, const int zero_measure_c_point_int) { + PflareKokkosTrace _trace("pmisr_existing_measure_cf_markers_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows, local_cols, global_rows, global_cols; @@ -519,6 +520,7 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // See the full comments in the CPU version pmisr_existing_measure_implicit_transpose PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength_mat, const int max_luby_steps, const int pmis_int, PetscScalarKokkosView &measure_local_d, intKokkosView &cf_markers_d, const int zero_measure_c_point_int) { + PflareKokkosTrace _trace("pmisr_existing_measure_implicit_transpose_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows, local_cols, global_rows, global_cols; @@ -1229,6 +1231,7 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength // You have to explicitly call copy_cf_markers_d2h(cf_markers_local) to do this PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, const int pmis_int, PetscReal *measure_local, const int zero_measure_c_point_int) { + PflareKokkosTrace _trace("pmisr_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows, local_cols, global_rows, global_cols; diff --git a/src/SAI_Zk.kokkos.cxx b/src/SAI_Zk.kokkos.cxx index 409332d2..dc40190b 100644 --- a/src/SAI_Zk.kokkos.cxx +++ b/src/SAI_Zk.kokkos.cxx @@ -13,6 +13,7 @@ PETSC_INTERN void calculate_and_build_sai_z_kokkos(Mat *A_ff, Mat *A_cf, Mat *sparsity_mat_cf, const int reuse_int_reuse_mat, Mat *reuse_mat, Mat *z_mat) { + PflareKokkosTrace _trace("calculate_and_build_sai_z_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows_cf, local_cols_cf; PetscInt local_rows_ff, local_cols_ff; diff --git a/src/VecISCopyLocalk.kokkos.cxx b/src/VecISCopyLocalk.kokkos.cxx index 8b653c96..2cd37d96 100644 --- a/src/VecISCopyLocalk.kokkos.cxx +++ b/src/VecISCopyLocalk.kokkos.cxx @@ -13,6 +13,7 @@ int max_levels = -1; // Destroys the data PETSC_INTERN void destroy_VecISCopyLocal_kokkos() { + PflareKokkosTrace _trace("destroy_VecISCopyLocal_kokkos"); if (IS_fine_views_local) { // Will automatically call the destructor on each element delete[] IS_fine_views_local; @@ -31,6 +32,7 @@ PETSC_INTERN void destroy_VecISCopyLocal_kokkos() // Creates the data we need to do the equivalent of veciscopy on local data in kokkos PETSC_INTERN void create_VecISCopyLocal_kokkos(int max_levels_input) { + PflareKokkosTrace _trace("create_VecISCopyLocal_kokkos"); // If not built if (!IS_fine_views_local) { @@ -65,6 +67,7 @@ PETSC_INTERN void create_VecISCopyLocal_kokkos(int max_levels_input) // Copy the input IS's to the device for our_level PETSC_INTERN void set_VecISCopyLocal_kokkos_our_level(int our_level, PetscInt global_row_start, IS *index_fine, IS *index_coarse) { + PflareKokkosTrace _trace("set_VecISCopyLocal_kokkos_our_level"); auto exec = PetscGetKokkosExecutionSpace(); const int level_idx = our_level - 1; @@ -124,6 +127,7 @@ PETSC_INTERN void set_VecISCopyLocal_kokkos_our_level(int our_level, PetscInt gl // Do the equivalent of veciscopy on local data using the IS data on the device PETSC_INTERN void VecISCopyLocal_kokkos(int our_level, int fine_int, Vec *vfull, int mode_int, Vec *vreduced) { + PflareKokkosTrace _trace("VecISCopyLocal_kokkos"); Kokkos::fence(); const int level_idx = our_level - 1; From a822ad53d34f0a664fb41247713d927a8b6d331c Mon Sep 17 00:00:00 2001 From: sdargavi Date: Fri, 3 Apr 2026 19:55:59 +0100 Subject: [PATCH 13/60] Write out to stderr --- include/kokkos_helper.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/kokkos_helper.hpp b/include/kokkos_helper.hpp index 3975a8d3..d34f8b8f 100644 --- a/include/kokkos_helper.hpp +++ b/include/kokkos_helper.hpp @@ -21,14 +21,14 @@ struct PflareKokkosTrace { PflareKokkosTrace(const char *n) : name(n) { int rank = 0; MPI_Comm_rank(MPI_COMM_WORLD, &rank); - printf("[PFLARE kokkos rank=%d] Entering %s\n", rank, name); - fflush(stdout); + fprintf(stderr, "[PFLARE kokkos rank=%d] Entering %s\n", rank, name); + fflush(stderr); } ~PflareKokkosTrace() { int rank = 0; MPI_Comm_rank(MPI_COMM_WORLD, &rank); - printf("[PFLARE kokkos rank=%d] Leaving %s\n", rank, name); - fflush(stdout); + fprintf(stderr, "[PFLARE kokkos rank=%d] Leaving %s\n", rank, name); + fflush(stderr); } }; From 6bcd65b892c58c6b34ea590303c574dccdd44ec5 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Fri, 3 Apr 2026 20:48:00 +0100 Subject: [PATCH 14/60] Add validation checks in pmisr --- src/PMISR_Modulek.kokkos.cxx | 164 +++++++++++++++++++++++++++++++++++ 1 file changed, 164 insertions(+) diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx index b22b6cc1..181fef76 100644 --- a/src/PMISR_Modulek.kokkos.cxx +++ b/src/PMISR_Modulek.kokkos.cxx @@ -58,6 +58,170 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_local, &device_local_i, &device_local_j, &device_local_vals, &mtype)); if (mpi) PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_nonlocal, &device_nonlocal_i, &device_nonlocal_j, &device_nonlocal_vals, &mtype)); + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // Validation checks (run once before main algorithm loops) + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + { + int rank_check; + MPI_Comm_rank(MPI_COMM_MATRIX, &rank_check); + bool found_error = false; + + // -- Host-side view extent checks -- + if ((PetscInt)measure_local_d.extent(0) < local_rows) { + fprintf(stderr, "[PFLARE pmisr check rank=%d] INVALID: measure_local_d extent %zu < local_rows %d\n", + rank_check, measure_local_d.extent(0), (int)local_rows); + found_error = true; + } + if ((PetscInt)cf_markers_d.extent(0) < local_rows) { + fprintf(stderr, "[PFLARE pmisr check rank=%d] INVALID: cf_markers_d extent %zu < local_rows %d\n", + rank_check, cf_markers_d.extent(0), (int)local_rows); + found_error = true; + } + + if (mpi) { + // lvec size must match cols_ao + PetscInt lvec_size; + PetscCallVoid(VecGetLocalSize(mat_mpi->lvec, &lvec_size)); + if (lvec_size != cols_ao) { + fprintf(stderr, "[PFLARE pmisr check rank=%d] INVALID: lvec_size %d != cols_ao %d\n", + rank_check, (int)lvec_size, (int)cols_ao); + found_error = true; + } + + // garray: each entry must be in [0, global_cols) and outside local ownership + const PetscInt *garray; + PetscCallVoid(MatMPIAIJGetSeqAIJ(*strength_mat, NULL, NULL, &garray)); + for (PetscInt k = 0; k < cols_ao; k++) { + if (garray[k] < 0 || garray[k] >= global_cols) { + fprintf(stderr, "[PFLARE pmisr check rank=%d] INVALID: garray[%d]=%d out of [0, %d)\n", + rank_check, (int)k, (int)garray[k], (int)global_cols); + found_error = true; + } else if (garray[k] >= global_row_start && garray[k] < global_row_end_plus_one) { + fprintf(stderr, "[PFLARE pmisr check rank=%d] INVALID: garray[%d]=%d is in local ownership [%d, %d)\n", + rank_check, (int)k, (int)garray[k], (int)global_row_start, (int)global_row_end_plus_one); + found_error = true; + } + } + } + + // -- Device-side CSR checks via Kokkos parallel_reduce -- + auto exec_check = PetscGetKokkosExecutionSpace(); + + // Read device_local_i[local_rows] (nnz) to host via a 1-element reduce + PetscInt nnz_local_check = 0; + if (local_rows > 0) { + Kokkos::parallel_reduce(Kokkos::RangePolicy<>(exec_check, 0, 1), + KOKKOS_LAMBDA(PetscInt, PetscInt& v) { v = device_local_i[local_rows]; }, + Kokkos::Max(nnz_local_check)); + } + + // Check device_local_i[0] == 0 + if (local_rows > 0) { + PetscInt local_i_zero; + Kokkos::parallel_reduce(Kokkos::RangePolicy<>(exec_check, 0, 1), + KOKKOS_LAMBDA(PetscInt, PetscInt& v) { v = device_local_i[0]; }, + Kokkos::Max(local_i_zero)); + if (local_i_zero != 0) { + fprintf(stderr, "[PFLARE pmisr check rank=%d] INVALID: device_local_i[0]=%d != 0\n", + rank_check, (int)local_i_zero); + found_error = true; + } + } + + // Check device_local_i is non-decreasing + if (local_rows > 0) { + PetscInt mono_err_local = 0; + Kokkos::parallel_reduce(Kokkos::RangePolicy<>(exec_check, 0, local_rows), + KOKKOS_LAMBDA(PetscInt i, PetscInt& err) { + if (device_local_i[i + 1] < device_local_i[i]) err++; + }, mono_err_local); + if (mono_err_local > 0) { + fprintf(stderr, "[PFLARE pmisr check rank=%d] INVALID: device_local_i is non-monotone (%d violations)\n", + rank_check, (int)mono_err_local); + found_error = true; + } + } + + // Check device_local_j values are in [0, local_cols) + if (nnz_local_check > 0) { + PetscInt j_min_local = local_cols; + PetscInt j_max_local = -1; + Kokkos::parallel_reduce(Kokkos::RangePolicy<>(exec_check, 0, nnz_local_check), + KOKKOS_LAMBDA(PetscInt k, PetscInt& lo) { + if (device_local_j[k] < lo) lo = device_local_j[k]; + }, Kokkos::Min(j_min_local)); + Kokkos::parallel_reduce(Kokkos::RangePolicy<>(exec_check, 0, nnz_local_check), + KOKKOS_LAMBDA(PetscInt k, PetscInt& hi) { + if (device_local_j[k] > hi) hi = device_local_j[k]; + }, Kokkos::Max(j_max_local)); + if (j_min_local < 0 || j_max_local >= local_cols) { + fprintf(stderr, "[PFLARE pmisr check rank=%d] INVALID: device_local_j range [%d, %d] not in [0, %d)\n", + rank_check, (int)j_min_local, (int)j_max_local, (int)local_cols); + found_error = true; + } + } + + if (mpi) { + // Read device_nonlocal_i[local_rows] (nnz) to host + PetscInt nnz_nonlocal_check = 0; + if (local_rows > 0) { + Kokkos::parallel_reduce(Kokkos::RangePolicy<>(exec_check, 0, 1), + KOKKOS_LAMBDA(PetscInt, PetscInt& v) { v = device_nonlocal_i[local_rows]; }, + Kokkos::Max(nnz_nonlocal_check)); + } + + // Check device_nonlocal_i[0] == 0 + if (local_rows > 0) { + PetscInt nonlocal_i_zero; + Kokkos::parallel_reduce(Kokkos::RangePolicy<>(exec_check, 0, 1), + KOKKOS_LAMBDA(PetscInt, PetscInt& v) { v = device_nonlocal_i[0]; }, + Kokkos::Max(nonlocal_i_zero)); + if (nonlocal_i_zero != 0) { + fprintf(stderr, "[PFLARE pmisr check rank=%d] INVALID: device_nonlocal_i[0]=%d != 0\n", + rank_check, (int)nonlocal_i_zero); + found_error = true; + } + } + + // Check device_nonlocal_i is non-decreasing + if (local_rows > 0) { + PetscInt mono_err_nonlocal = 0; + Kokkos::parallel_reduce(Kokkos::RangePolicy<>(exec_check, 0, local_rows), + KOKKOS_LAMBDA(PetscInt i, PetscInt& err) { + if (device_nonlocal_i[i + 1] < device_nonlocal_i[i]) err++; + }, mono_err_nonlocal); + if (mono_err_nonlocal > 0) { + fprintf(stderr, "[PFLARE pmisr check rank=%d] INVALID: device_nonlocal_i is non-monotone (%d violations)\n", + rank_check, (int)mono_err_nonlocal); + found_error = true; + } + } + + // Check device_nonlocal_j values are in [0, cols_ao) + if (nnz_nonlocal_check > 0) { + PetscInt j_min_nonlocal = cols_ao; + PetscInt j_max_nonlocal = -1; + Kokkos::parallel_reduce(Kokkos::RangePolicy<>(exec_check, 0, nnz_nonlocal_check), + KOKKOS_LAMBDA(PetscInt k, PetscInt& lo) { + if (device_nonlocal_j[k] < lo) lo = device_nonlocal_j[k]; + }, Kokkos::Min(j_min_nonlocal)); + Kokkos::parallel_reduce(Kokkos::RangePolicy<>(exec_check, 0, nnz_nonlocal_check), + KOKKOS_LAMBDA(PetscInt k, PetscInt& hi) { + if (device_nonlocal_j[k] > hi) hi = device_nonlocal_j[k]; + }, Kokkos::Max(j_max_nonlocal)); + if (j_min_nonlocal < 0 || j_max_nonlocal >= cols_ao) { + fprintf(stderr, "[PFLARE pmisr check rank=%d] INVALID: device_nonlocal_j range [%d, %d] not in [0, %d)\n", + rank_check, (int)j_min_nonlocal, (int)j_max_nonlocal, (int)cols_ao); + found_error = true; + } + } + } + + fflush(stderr); + if (found_error) PETSCABORT(MPI_COMM_MATRIX, PETSC_ERR_ARG_WRONG); + } + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + intKokkosView cf_markers_nonlocal_d; // Scratch buffer used for local update bookkeeping during overlap with reverse scatter. intKokkosView cf_markers_temp_d; From da621a60fa8e2fb89dad6b07210939e706c8fc93 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Fri, 3 Apr 2026 21:10:00 +0100 Subject: [PATCH 15/60] Add more printing inside pmisr --- src/PMISR_Modulek.kokkos.cxx | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx index 181fef76..568f75da 100644 --- a/src/PMISR_Modulek.kokkos.cxx +++ b/src/PMISR_Modulek.kokkos.cxx @@ -222,6 +222,15 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // Checkpoint helper: fence then print location to stderr so the last printed + // line before a GPU fault tells us exactly which kernel caused it. + int rank_cp; MPI_Comm_rank(MPI_COMM_MATRIX, &rank_cp); + // loops_through is declared here (not inside the do-while) so PMISR_CP can use it pre-loop. + int loops_through = -1; +#define PMISR_CP(label) do { Kokkos::fence(); \ + fprintf(stderr, "[PFLARE pmisr cp rank=%d iter=%d] " label "\n", rank_cp, loops_through); \ + fflush(stderr); } while(0) + intKokkosView cf_markers_nonlocal_d; // Scratch buffer used for local update bookkeeping during overlap with reverse scatter. intKokkosView cf_markers_temp_d; @@ -264,6 +273,7 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // Initialise the set PetscInt counter_in_set_start = 0; + PMISR_CP("A: before initial parallel_reduce"); // Count how many in the set to begin with and set their CF markers Kokkos::parallel_reduce ("Reduction", Kokkos::RangePolicy<>(exec, 0, local_rows), KOKKOS_LAMBDA (const PetscInt i, PetscInt& update) { // If already assigned by the input @@ -298,6 +308,7 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co update++; } }, counter_in_set_start); + PMISR_CP("B: after initial parallel_reduce"); // Check the total number of undecided in parallel PetscInt counter_undecided, counter_parallel; @@ -327,7 +338,6 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co } // Let's keep track of how many times we go through the loops - int loops_through = -1; do { // Match the fortran version and include a pre-test on the do-while @@ -350,6 +360,7 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // We write directly from cf_markers_d; no extra send staging is needed. { PetscScalarKokkosView root_scalar_d; + PMISR_CP("C: before fwd scatter kernel"); PetscCallVoid(VecGetKokkosViewWrite(scatter_root_vec, &root_scalar_d)); Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, local_rows), KOKKOS_LAMBDA(PetscInt i) { @@ -372,6 +383,7 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // ~~~~~~~~ // Go and do the local component // ~~~~~~~~ + PMISR_CP("D: before local TeamPolicy kernel"); Kokkos::parallel_for( Kokkos::TeamPolicy<>(exec, local_rows, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { @@ -431,6 +443,7 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // Convert PetscScalar → int after End, when the receive buffer is complete. { ConstPetscScalarKokkosView leaf_scalar_d; + PMISR_CP("E: before nonlocal convert kernel"); PetscCallVoid(VecGetKokkosView(mat_mpi->lvec, &leaf_scalar_d)); Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(PetscInt i) { @@ -439,6 +452,7 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co PetscCallVoid(VecRestoreKokkosView(mat_mpi->lvec, &leaf_scalar_d)); } + PMISR_CP("F: before nonlocal TeamPolicy kernel"); Kokkos::parallel_for( Kokkos::TeamPolicy<>(exec, local_rows, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { @@ -488,6 +502,7 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co if (mpi) { + PMISR_CP("G: before reverse scatter setup"); // We're going to do an add reverse scatter, so set them to zero Kokkos::deep_copy(exec, cf_markers_nonlocal_d, 0); @@ -522,6 +537,7 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // We've updated the values in cf_markers_nonlocal // Calling a reverse scatter add will then update the values of cf_markers_local + PMISR_CP("H: before rev scatter convert kernels"); // Reduce with a sum via VecScatter with ADD_VALUES, SCATTER_REVERSE // Convert int → PetscScalar for the leaf (nonlocal) data { @@ -545,11 +561,13 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co } // Ensure send/receive buffers are stable before Begin. Kokkos::fence(); + fprintf(stderr, "[PFLARE pmisr cp rank=%d iter=%d] I: before rev VecScatterBegin\n", rank_cp, loops_through); fflush(stderr); PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, mat_mpi->lvec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); // Complete reverse scatter before reading reduced root buffer. PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, mat_mpi->lvec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); // While reverse scatter is in-flight, do local-only updates in cf_markers_temp_d. + PMISR_CP("J: before overlap TeamPolicy kernel"); // This must not touch scatter_root_vec/mat_mpi->lvec. Kokkos::parallel_for( Kokkos::TeamPolicy<>(exec, local_rows, Kokkos::AUTO()), @@ -577,6 +595,7 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co }); // Convert PetscScalar → int back to cf_markers_d after End. + PMISR_CP("K: before root->cf_markers_d convert"); { ConstPetscScalarKokkosView root_scalar_d; PetscCallVoid(VecGetKokkosView(scatter_root_vec, &root_scalar_d)); @@ -588,6 +607,7 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co } // Merge the local updates after the VecScatter reduction has completed. + PMISR_CP("L: before merge kernel"); Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, local_rows), KOKKOS_LAMBDA(PetscInt i) { From 1f97f0746e99ee41398fcfce306697cd8ff48616 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Fri, 3 Apr 2026 21:22:19 +0100 Subject: [PATCH 16/60] remove extra fence in print --- src/PMISR_Modulek.kokkos.cxx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx index 568f75da..9a1138de 100644 --- a/src/PMISR_Modulek.kokkos.cxx +++ b/src/PMISR_Modulek.kokkos.cxx @@ -227,8 +227,7 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co int rank_cp; MPI_Comm_rank(MPI_COMM_MATRIX, &rank_cp); // loops_through is declared here (not inside the do-while) so PMISR_CP can use it pre-loop. int loops_through = -1; -#define PMISR_CP(label) do { Kokkos::fence(); \ - fprintf(stderr, "[PFLARE pmisr cp rank=%d iter=%d] " label "\n", rank_cp, loops_through); \ +#define PMISR_CP(label) do { fprintf(stderr, "[PFLARE pmisr cp rank=%d iter=%d] " label "\n", rank_cp, loops_through); \ fflush(stderr); } while(0) intKokkosView cf_markers_nonlocal_d; From 3b50ce9c75d60480d363b0e9a278cdfa274b39b9 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Fri, 3 Apr 2026 21:54:57 +0100 Subject: [PATCH 17/60] Extra debugging --- src/PMISR_Modulek.kokkos.cxx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx index 9a1138de..418d2bbe 100644 --- a/src/PMISR_Modulek.kokkos.cxx +++ b/src/PMISR_Modulek.kokkos.cxx @@ -258,12 +258,16 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co PetscCallVoid(VecRestoreKokkosViewWrite(measure_root_vec, &root_scalar_d)); } // Ensure send/receive buffers are stable before Begin. - Kokkos::fence(); + Kokkos::fence(); + fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre0: after line-261 fence, before VecScatterBegin\n", rank_cp); fflush(stderr); PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, measure_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, measure_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); + fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre1: after VecScatterEnd, before VecGetKokkosView\n", rank_cp); fflush(stderr); { ConstPetscScalarKokkosView lvec_scalar_d; PetscCallVoid(VecGetKokkosView(mat_mpi->lvec, &lvec_scalar_d)); + fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre2: lvec_scalar_d.extent=%zu measure_nonlocal_d.extent=%zu\n", + rank_cp, lvec_scalar_d.extent(0), measure_nonlocal_d.extent(0)); fflush(stderr); Kokkos::deep_copy(exec, measure_nonlocal_d, lvec_scalar_d); PetscCallVoid(VecRestoreKokkosView(mat_mpi->lvec, &lvec_scalar_d)); } From 3d7766e4fa1b63b1f29f42159e8e30b152e31754 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Fri, 3 Apr 2026 22:03:26 +0100 Subject: [PATCH 18/60] Extra writes --- src/PMISR_Modulek.kokkos.cxx | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx index 418d2bbe..de0a963f 100644 --- a/src/PMISR_Modulek.kokkos.cxx +++ b/src/PMISR_Modulek.kokkos.cxx @@ -259,8 +259,14 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co } // Ensure send/receive buffers are stable before Begin. Kokkos::fence(); - fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre0: after line-261 fence, before VecScatterBegin\n", rank_cp); fflush(stderr); + { + VecType lvec_type; PetscCallVoid(VecGetType(mat_mpi->lvec, &lvec_type)); + PetscInt lvec_sz; PetscCallVoid(VecGetLocalSize(mat_mpi->lvec, &lvec_sz)); + fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre0: lvec_type=%s lvec_sz=%d cols_ao=%d\n", + rank_cp, lvec_type, (int)lvec_sz, (int)cols_ao); fflush(stderr); + } PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, measure_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); + fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre0b: after VecScatterBegin\n", rank_cp); fflush(stderr); PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, measure_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre1: after VecScatterEnd, before VecGetKokkosView\n", rank_cp); fflush(stderr); { From aee4f2139368f9726b421cad62d1e2eec4bc833b Mon Sep 17 00:00:00 2001 From: sdargavi Date: Fri, 3 Apr 2026 22:27:58 +0100 Subject: [PATCH 19/60] Fence between start/end --- src/PMISR_Modulek.kokkos.cxx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx index de0a963f..12e00285 100644 --- a/src/PMISR_Modulek.kokkos.cxx +++ b/src/PMISR_Modulek.kokkos.cxx @@ -267,6 +267,8 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co } PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, measure_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre0b: after VecScatterBegin\n", rank_cp); fflush(stderr); + Kokkos::fence(); + fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre0c: after fence between Begin and End\n", rank_cp); fflush(stderr); PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, measure_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre1: after VecScatterEnd, before VecGetKokkosView\n", rank_cp); fflush(stderr); { From 333019cc4a31084fd3688d583a0aa74e9d597d29 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Sat, 4 Apr 2026 00:08:25 +0100 Subject: [PATCH 20/60] extra fence --- src/PMISR_Modulek.kokkos.cxx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx index 12e00285..9d00141d 100644 --- a/src/PMISR_Modulek.kokkos.cxx +++ b/src/PMISR_Modulek.kokkos.cxx @@ -270,6 +270,8 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co Kokkos::fence(); fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre0c: after fence between Begin and End\n", rank_cp); fflush(stderr); PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, measure_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); + Kokkos::fence(); + fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre0d: after fence post VecScatterEnd\n", rank_cp); fflush(stderr); fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre1: after VecScatterEnd, before VecGetKokkosView\n", rank_cp); fflush(stderr); { ConstPetscScalarKokkosView lvec_scalar_d; From 9466d26f98ac35529d41ecd96cd0e030d37bd23a Mon Sep 17 00:00:00 2001 From: sdargavi Date: Sat, 4 Apr 2026 00:16:15 +0100 Subject: [PATCH 21/60] print sf details --- src/PMISR_Modulek.kokkos.cxx | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx index 9d00141d..a966d39b 100644 --- a/src/PMISR_Modulek.kokkos.cxx +++ b/src/PMISR_Modulek.kokkos.cxx @@ -262,8 +262,11 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co { VecType lvec_type; PetscCallVoid(VecGetType(mat_mpi->lvec, &lvec_type)); PetscInt lvec_sz; PetscCallVoid(VecGetLocalSize(mat_mpi->lvec, &lvec_sz)); - fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre0: lvec_type=%s lvec_sz=%d cols_ao=%d\n", - rank_cp, lvec_type, (int)lvec_sz, (int)cols_ao); fflush(stderr); + PetscInt root_sz; PetscCallVoid(VecGetLocalSize(measure_root_vec, &root_sz)); + PetscInt sf_nroots, sf_nleaves; + PetscCallVoid(PetscSFGetGraph(mat_mpi->Mvctx, &sf_nroots, &sf_nleaves, NULL, NULL)); + fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre0: lvec_type=%s lvec_sz=%d cols_ao=%d root_sz=%d sf_nroots=%d sf_nleaves=%d\n", + rank_cp, lvec_type, (int)lvec_sz, (int)cols_ao, (int)root_sz, (int)sf_nroots, (int)sf_nleaves); fflush(stderr); } PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, measure_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre0b: after VecScatterBegin\n", rank_cp); fflush(stderr); From 8b36f17af5f029e4a1c2fa218d22711d4fbb08fa Mon Sep 17 00:00:00 2001 From: sdargavi Date: Sat, 4 Apr 2026 00:37:29 +0100 Subject: [PATCH 22/60] print out lvec kokkos data pointer --- src/PMISR_Modulek.kokkos.cxx | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx index a966d39b..1e6ef65a 100644 --- a/src/PMISR_Modulek.kokkos.cxx +++ b/src/PMISR_Modulek.kokkos.cxx @@ -265,8 +265,13 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co PetscInt root_sz; PetscCallVoid(VecGetLocalSize(measure_root_vec, &root_sz)); PetscInt sf_nroots, sf_nleaves; PetscCallVoid(PetscSFGetGraph(mat_mpi->Mvctx, &sf_nroots, &sf_nleaves, NULL, NULL)); - fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre0: lvec_type=%s lvec_sz=%d cols_ao=%d root_sz=%d sf_nroots=%d sf_nleaves=%d\n", - rank_cp, lvec_type, (int)lvec_sz, (int)cols_ao, (int)root_sz, (int)sf_nroots, (int)sf_nleaves); fflush(stderr); + PetscScalarKokkosView lvec_write_d; + PetscCallVoid(VecGetKokkosViewWrite(mat_mpi->lvec, &lvec_write_d)); + void *lvec_devptr = (void*)lvec_write_d.data(); + PetscCallVoid(VecRestoreKokkosViewWrite(mat_mpi->lvec, &lvec_write_d)); + VecType root_type; PetscCallVoid(VecGetType(measure_root_vec, &root_type)); + fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre0: lvec_type=%s lvec_sz=%d cols_ao=%d root_type=%s root_sz=%d sf_nroots=%d sf_nleaves=%d lvec_devptr=%p\n", + rank_cp, lvec_type, (int)lvec_sz, (int)cols_ao, root_type, (int)root_sz, (int)sf_nroots, (int)sf_nleaves, lvec_devptr); fflush(stderr); } PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, measure_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre0b: after VecScatterBegin\n", rank_cp); fflush(stderr); From b262c53d40bc2420d478a247d50cbdcc310e96d3 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Sat, 4 Apr 2026 01:44:52 +0100 Subject: [PATCH 23/60] Replace lvec use --- src/MatDiagDomk.kokkos.cxx | 13 +- src/PETSc_Helperk.kokkos.cxx | 15 +- src/PMISR_Modulek.kokkos.cxx | 295 ++++++----------------------------- 3 files changed, 67 insertions(+), 256 deletions(-) diff --git a/src/MatDiagDomk.kokkos.cxx b/src/MatDiagDomk.kokkos.cxx index 8336b40d..b290e6ee 100644 --- a/src/MatDiagDomk.kokkos.cxx +++ b/src/MatDiagDomk.kokkos.cxx @@ -48,6 +48,7 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio intKokkosView cf_markers_d = cf_markers_local_d; intKokkosView cf_markers_nonlocal_d; Vec scatter_root_vec = NULL; + Vec scatter_leaf_vec = NULL; PetscIntKokkosView is_fine_local_d; auto exec = PetscGetKokkosExecutionSpace(); @@ -75,6 +76,7 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio // Scatter cf_markers via VecScatter (int -> PetscScalar conversion required) PetscCallVoid(MatCreateVecs(*input_mat, &scatter_root_vec, NULL)); + PetscCallVoid(VecDuplicate(mat_mpi->lvec, &scatter_leaf_vec)); { PetscScalarKokkosView root_scalar_d; PetscCallVoid(VecGetKokkosViewWrite(scatter_root_vec, &root_scalar_d)); @@ -88,9 +90,9 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio // Start comms, then overlap with local-only work below. // Mvctx must have only one active comm at a time. // Ensure send/receive buffers are stable before Begin. - Kokkos::fence(); - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); + Kokkos::fence(); + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); } // ~~~~~~~~~~~~~~~ @@ -165,14 +167,15 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio { { ConstPetscScalarKokkosView lvec_scalar_d; - PetscCallVoid(VecGetKokkosView(mat_mpi->lvec, &lvec_scalar_d)); + PetscCallVoid(VecGetKokkosView(scatter_leaf_vec, &lvec_scalar_d)); Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(PetscInt i) { cf_markers_nonlocal_d(i) = (int)lvec_scalar_d(i); }); - PetscCallVoid(VecRestoreKokkosView(mat_mpi->lvec, &lvec_scalar_d)); + PetscCallVoid(VecRestoreKokkosView(scatter_leaf_vec, &lvec_scalar_d)); } PetscCallVoid(VecDestroy(&scatter_root_vec)); + PetscCallVoid(VecDestroy(&scatter_leaf_vec)); Kokkos::fence(); } diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index d3a3f54c..7065d77b 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2375,11 +2375,13 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV /* (2) Scatter x and cmap using Mvctx to get their off-process portions */ // Keep at most one active communication on Mvctx at a time. // While Begin/End is in flight, do not touch the corresponding send/recv buffers. + Vec x_leaf_vec; + PetscCallVoid(VecDuplicate(mat_mpi->lvec, &x_leaf_vec)); // Ensure send/receive buffers are stable before Begin. - Kokkos::fence(); - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, x_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); - // x scatter completed: mat_mpi->lvec is now safe to read. - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, x_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); + Kokkos::fence(); + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, x_vec, x_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + // x scatter completed: x_leaf_vec is now safe to read. + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, x_vec, x_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); // Fill cmap_vec on device: cmap[is_col(i)] = i + isstart, rest = -1 { @@ -2413,7 +2415,7 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV if (cols_ao > 0) { ConstPetscScalarKokkosView lvec_scalar_d; - PetscCallVoid(VecGetKokkosView(mat_mpi->lvec, &lvec_scalar_d)); + PetscCallVoid(VecGetKokkosView(x_leaf_vec, &lvec_scalar_d)); Kokkos::parallel_reduce("FindMatches", Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(const PetscInt i, PetscInt& thread_sum) { @@ -2427,7 +2429,7 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV Kokkos::Sum(col_ao_output) ); - PetscCallVoid(VecRestoreKokkosView(mat_mpi->lvec, &lvec_scalar_d)); + PetscCallVoid(VecRestoreKokkosView(x_leaf_vec, &lvec_scalar_d)); } // Need to do an exclusive scan on is_col_o_match_d to get the new local indices @@ -2471,6 +2473,7 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV // Cleanup Vecs PetscCallVoid(VecDestroy(&x_vec)); + PetscCallVoid(VecDestroy(&x_leaf_vec)); PetscCallVoid(VecDestroy(&cmap_vec)); PetscCallVoid(VecDestroy(&lcmap_vec)); } diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx index 1e6ef65a..481c2c77 100644 --- a/src/PMISR_Modulek.kokkos.cxx +++ b/src/PMISR_Modulek.kokkos.cxx @@ -58,177 +58,8 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_local, &device_local_i, &device_local_j, &device_local_vals, &mtype)); if (mpi) PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_nonlocal, &device_nonlocal_i, &device_nonlocal_j, &device_nonlocal_vals, &mtype)); - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // Validation checks (run once before main algorithm loops) - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - { - int rank_check; - MPI_Comm_rank(MPI_COMM_MATRIX, &rank_check); - bool found_error = false; - - // -- Host-side view extent checks -- - if ((PetscInt)measure_local_d.extent(0) < local_rows) { - fprintf(stderr, "[PFLARE pmisr check rank=%d] INVALID: measure_local_d extent %zu < local_rows %d\n", - rank_check, measure_local_d.extent(0), (int)local_rows); - found_error = true; - } - if ((PetscInt)cf_markers_d.extent(0) < local_rows) { - fprintf(stderr, "[PFLARE pmisr check rank=%d] INVALID: cf_markers_d extent %zu < local_rows %d\n", - rank_check, cf_markers_d.extent(0), (int)local_rows); - found_error = true; - } - - if (mpi) { - // lvec size must match cols_ao - PetscInt lvec_size; - PetscCallVoid(VecGetLocalSize(mat_mpi->lvec, &lvec_size)); - if (lvec_size != cols_ao) { - fprintf(stderr, "[PFLARE pmisr check rank=%d] INVALID: lvec_size %d != cols_ao %d\n", - rank_check, (int)lvec_size, (int)cols_ao); - found_error = true; - } - - // garray: each entry must be in [0, global_cols) and outside local ownership - const PetscInt *garray; - PetscCallVoid(MatMPIAIJGetSeqAIJ(*strength_mat, NULL, NULL, &garray)); - for (PetscInt k = 0; k < cols_ao; k++) { - if (garray[k] < 0 || garray[k] >= global_cols) { - fprintf(stderr, "[PFLARE pmisr check rank=%d] INVALID: garray[%d]=%d out of [0, %d)\n", - rank_check, (int)k, (int)garray[k], (int)global_cols); - found_error = true; - } else if (garray[k] >= global_row_start && garray[k] < global_row_end_plus_one) { - fprintf(stderr, "[PFLARE pmisr check rank=%d] INVALID: garray[%d]=%d is in local ownership [%d, %d)\n", - rank_check, (int)k, (int)garray[k], (int)global_row_start, (int)global_row_end_plus_one); - found_error = true; - } - } - } - - // -- Device-side CSR checks via Kokkos parallel_reduce -- - auto exec_check = PetscGetKokkosExecutionSpace(); - - // Read device_local_i[local_rows] (nnz) to host via a 1-element reduce - PetscInt nnz_local_check = 0; - if (local_rows > 0) { - Kokkos::parallel_reduce(Kokkos::RangePolicy<>(exec_check, 0, 1), - KOKKOS_LAMBDA(PetscInt, PetscInt& v) { v = device_local_i[local_rows]; }, - Kokkos::Max(nnz_local_check)); - } - - // Check device_local_i[0] == 0 - if (local_rows > 0) { - PetscInt local_i_zero; - Kokkos::parallel_reduce(Kokkos::RangePolicy<>(exec_check, 0, 1), - KOKKOS_LAMBDA(PetscInt, PetscInt& v) { v = device_local_i[0]; }, - Kokkos::Max(local_i_zero)); - if (local_i_zero != 0) { - fprintf(stderr, "[PFLARE pmisr check rank=%d] INVALID: device_local_i[0]=%d != 0\n", - rank_check, (int)local_i_zero); - found_error = true; - } - } - - // Check device_local_i is non-decreasing - if (local_rows > 0) { - PetscInt mono_err_local = 0; - Kokkos::parallel_reduce(Kokkos::RangePolicy<>(exec_check, 0, local_rows), - KOKKOS_LAMBDA(PetscInt i, PetscInt& err) { - if (device_local_i[i + 1] < device_local_i[i]) err++; - }, mono_err_local); - if (mono_err_local > 0) { - fprintf(stderr, "[PFLARE pmisr check rank=%d] INVALID: device_local_i is non-monotone (%d violations)\n", - rank_check, (int)mono_err_local); - found_error = true; - } - } - - // Check device_local_j values are in [0, local_cols) - if (nnz_local_check > 0) { - PetscInt j_min_local = local_cols; - PetscInt j_max_local = -1; - Kokkos::parallel_reduce(Kokkos::RangePolicy<>(exec_check, 0, nnz_local_check), - KOKKOS_LAMBDA(PetscInt k, PetscInt& lo) { - if (device_local_j[k] < lo) lo = device_local_j[k]; - }, Kokkos::Min(j_min_local)); - Kokkos::parallel_reduce(Kokkos::RangePolicy<>(exec_check, 0, nnz_local_check), - KOKKOS_LAMBDA(PetscInt k, PetscInt& hi) { - if (device_local_j[k] > hi) hi = device_local_j[k]; - }, Kokkos::Max(j_max_local)); - if (j_min_local < 0 || j_max_local >= local_cols) { - fprintf(stderr, "[PFLARE pmisr check rank=%d] INVALID: device_local_j range [%d, %d] not in [0, %d)\n", - rank_check, (int)j_min_local, (int)j_max_local, (int)local_cols); - found_error = true; - } - } - - if (mpi) { - // Read device_nonlocal_i[local_rows] (nnz) to host - PetscInt nnz_nonlocal_check = 0; - if (local_rows > 0) { - Kokkos::parallel_reduce(Kokkos::RangePolicy<>(exec_check, 0, 1), - KOKKOS_LAMBDA(PetscInt, PetscInt& v) { v = device_nonlocal_i[local_rows]; }, - Kokkos::Max(nnz_nonlocal_check)); - } - - // Check device_nonlocal_i[0] == 0 - if (local_rows > 0) { - PetscInt nonlocal_i_zero; - Kokkos::parallel_reduce(Kokkos::RangePolicy<>(exec_check, 0, 1), - KOKKOS_LAMBDA(PetscInt, PetscInt& v) { v = device_nonlocal_i[0]; }, - Kokkos::Max(nonlocal_i_zero)); - if (nonlocal_i_zero != 0) { - fprintf(stderr, "[PFLARE pmisr check rank=%d] INVALID: device_nonlocal_i[0]=%d != 0\n", - rank_check, (int)nonlocal_i_zero); - found_error = true; - } - } - - // Check device_nonlocal_i is non-decreasing - if (local_rows > 0) { - PetscInt mono_err_nonlocal = 0; - Kokkos::parallel_reduce(Kokkos::RangePolicy<>(exec_check, 0, local_rows), - KOKKOS_LAMBDA(PetscInt i, PetscInt& err) { - if (device_nonlocal_i[i + 1] < device_nonlocal_i[i]) err++; - }, mono_err_nonlocal); - if (mono_err_nonlocal > 0) { - fprintf(stderr, "[PFLARE pmisr check rank=%d] INVALID: device_nonlocal_i is non-monotone (%d violations)\n", - rank_check, (int)mono_err_nonlocal); - found_error = true; - } - } - - // Check device_nonlocal_j values are in [0, cols_ao) - if (nnz_nonlocal_check > 0) { - PetscInt j_min_nonlocal = cols_ao; - PetscInt j_max_nonlocal = -1; - Kokkos::parallel_reduce(Kokkos::RangePolicy<>(exec_check, 0, nnz_nonlocal_check), - KOKKOS_LAMBDA(PetscInt k, PetscInt& lo) { - if (device_nonlocal_j[k] < lo) lo = device_nonlocal_j[k]; - }, Kokkos::Min(j_min_nonlocal)); - Kokkos::parallel_reduce(Kokkos::RangePolicy<>(exec_check, 0, nnz_nonlocal_check), - KOKKOS_LAMBDA(PetscInt k, PetscInt& hi) { - if (device_nonlocal_j[k] > hi) hi = device_nonlocal_j[k]; - }, Kokkos::Max(j_max_nonlocal)); - if (j_min_nonlocal < 0 || j_max_nonlocal >= cols_ao) { - fprintf(stderr, "[PFLARE pmisr check rank=%d] INVALID: device_nonlocal_j range [%d, %d] not in [0, %d)\n", - rank_check, (int)j_min_nonlocal, (int)j_max_nonlocal, (int)cols_ao); - found_error = true; - } - } - } - fflush(stderr); - if (found_error) PETSCABORT(MPI_COMM_MATRIX, PETSC_ERR_ARG_WRONG); - } - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - // Checkpoint helper: fence then print location to stderr so the last printed - // line before a GPU fault tells us exactly which kernel caused it. - int rank_cp; MPI_Comm_rank(MPI_COMM_MATRIX, &rank_cp); - // loops_through is declared here (not inside the do-while) so PMISR_CP can use it pre-loop. int loops_through = -1; -#define PMISR_CP(label) do { fprintf(stderr, "[PFLARE pmisr cp rank=%d iter=%d] " label "\n", rank_cp, loops_through); \ - fflush(stderr); } while(0) intKokkosView cf_markers_nonlocal_d; // Scratch buffer used for local update bookkeeping during overlap with reverse scatter. @@ -249,8 +80,9 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // Scatter the measure using VecScatter (matching PETSc's own buffer management) if (mpi) { - Vec measure_root_vec; + Vec measure_root_vec, measure_leaf_vec; PetscCallVoid(MatCreateVecs(*strength_mat, &measure_root_vec, NULL)); + PetscCallVoid(VecDuplicate(mat_mpi->lvec, &measure_leaf_vec)); { PetscScalarKokkosView root_scalar_d; PetscCallVoid(VecGetKokkosViewWrite(measure_root_vec, &root_scalar_d)); @@ -259,42 +91,20 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co } // Ensure send/receive buffers are stable before Begin. Kokkos::fence(); + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, measure_root_vec, measure_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, measure_root_vec, measure_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); { - VecType lvec_type; PetscCallVoid(VecGetType(mat_mpi->lvec, &lvec_type)); - PetscInt lvec_sz; PetscCallVoid(VecGetLocalSize(mat_mpi->lvec, &lvec_sz)); - PetscInt root_sz; PetscCallVoid(VecGetLocalSize(measure_root_vec, &root_sz)); - PetscInt sf_nroots, sf_nleaves; - PetscCallVoid(PetscSFGetGraph(mat_mpi->Mvctx, &sf_nroots, &sf_nleaves, NULL, NULL)); - PetscScalarKokkosView lvec_write_d; - PetscCallVoid(VecGetKokkosViewWrite(mat_mpi->lvec, &lvec_write_d)); - void *lvec_devptr = (void*)lvec_write_d.data(); - PetscCallVoid(VecRestoreKokkosViewWrite(mat_mpi->lvec, &lvec_write_d)); - VecType root_type; PetscCallVoid(VecGetType(measure_root_vec, &root_type)); - fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre0: lvec_type=%s lvec_sz=%d cols_ao=%d root_type=%s root_sz=%d sf_nroots=%d sf_nleaves=%d lvec_devptr=%p\n", - rank_cp, lvec_type, (int)lvec_sz, (int)cols_ao, root_type, (int)root_sz, (int)sf_nroots, (int)sf_nleaves, lvec_devptr); fflush(stderr); - } - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, measure_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); - fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre0b: after VecScatterBegin\n", rank_cp); fflush(stderr); - Kokkos::fence(); - fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre0c: after fence between Begin and End\n", rank_cp); fflush(stderr); - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, measure_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); - Kokkos::fence(); - fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre0d: after fence post VecScatterEnd\n", rank_cp); fflush(stderr); - fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre1: after VecScatterEnd, before VecGetKokkosView\n", rank_cp); fflush(stderr); - { - ConstPetscScalarKokkosView lvec_scalar_d; - PetscCallVoid(VecGetKokkosView(mat_mpi->lvec, &lvec_scalar_d)); - fprintf(stderr, "[PFLARE pmisr cp rank=%d] A-pre2: lvec_scalar_d.extent=%zu measure_nonlocal_d.extent=%zu\n", - rank_cp, lvec_scalar_d.extent(0), measure_nonlocal_d.extent(0)); fflush(stderr); - Kokkos::deep_copy(exec, measure_nonlocal_d, lvec_scalar_d); - PetscCallVoid(VecRestoreKokkosView(mat_mpi->lvec, &lvec_scalar_d)); + ConstPetscScalarKokkosView leaf_scalar_d; + PetscCallVoid(VecGetKokkosView(measure_leaf_vec, &leaf_scalar_d)); + Kokkos::deep_copy(exec, measure_nonlocal_d, leaf_scalar_d); + PetscCallVoid(VecRestoreKokkosView(measure_leaf_vec, &leaf_scalar_d)); } PetscCallVoid(VecDestroy(&measure_root_vec)); + PetscCallVoid(VecDestroy(&measure_leaf_vec)); } // Initialise the set PetscInt counter_in_set_start = 0; - PMISR_CP("A: before initial parallel_reduce"); // Count how many in the set to begin with and set their CF markers Kokkos::parallel_reduce ("Reduction", Kokkos::RangePolicy<>(exec, 0, local_rows), KOKKOS_LAMBDA (const PetscInt i, PetscInt& update) { // If already assigned by the input @@ -329,7 +139,6 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co update++; } }, counter_in_set_start); - PMISR_CP("B: after initial parallel_reduce"); // Check the total number of undecided in parallel PetscInt counter_undecided, counter_parallel; @@ -353,9 +162,10 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // ~~~~~~~~~~~~ // Create reusable Vecs for VecScatter inside the loop (cf_markers int → PetscScalar) - Vec scatter_root_vec = NULL; + Vec scatter_root_vec = NULL, scatter_leaf_vec = NULL; if (mpi) { PetscCallVoid(MatCreateVecs(*strength_mat, &scatter_root_vec, NULL)); + PetscCallVoid(VecDuplicate(mat_mpi->lvec, &scatter_leaf_vec)); } // Let's keep track of how many times we go through the loops @@ -381,7 +191,6 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // We write directly from cf_markers_d; no extra send staging is needed. { PetscScalarKokkosView root_scalar_d; - PMISR_CP("C: before fwd scatter kernel"); PetscCallVoid(VecGetKokkosViewWrite(scatter_root_vec, &root_scalar_d)); Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, local_rows), KOKKOS_LAMBDA(PetscInt i) { @@ -391,9 +200,9 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co } // Ensure the root buffer is no longer being written before Begin. Kokkos::fence(); - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); // Complete the in-flight forward scatter before reading the receive buffer. - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); } @@ -404,7 +213,6 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // ~~~~~~~~ // Go and do the local component // ~~~~~~~~ - PMISR_CP("D: before local TeamPolicy kernel"); Kokkos::parallel_for( Kokkos::TeamPolicy<>(exec, local_rows, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { @@ -464,16 +272,14 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // Convert PetscScalar → int after End, when the receive buffer is complete. { ConstPetscScalarKokkosView leaf_scalar_d; - PMISR_CP("E: before nonlocal convert kernel"); - PetscCallVoid(VecGetKokkosView(mat_mpi->lvec, &leaf_scalar_d)); + PetscCallVoid(VecGetKokkosView(scatter_leaf_vec, &leaf_scalar_d)); Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(PetscInt i) { cf_markers_nonlocal_d(i) = (int)leaf_scalar_d(i); }); - PetscCallVoid(VecRestoreKokkosView(mat_mpi->lvec, &leaf_scalar_d)); + PetscCallVoid(VecRestoreKokkosView(scatter_leaf_vec, &leaf_scalar_d)); } - PMISR_CP("F: before nonlocal TeamPolicy kernel"); Kokkos::parallel_for( Kokkos::TeamPolicy<>(exec, local_rows, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { @@ -523,7 +329,6 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co if (mpi) { - PMISR_CP("G: before reverse scatter setup"); // We're going to do an add reverse scatter, so set them to zero Kokkos::deep_copy(exec, cf_markers_nonlocal_d, 0); @@ -558,17 +363,16 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // We've updated the values in cf_markers_nonlocal // Calling a reverse scatter add will then update the values of cf_markers_local - PMISR_CP("H: before rev scatter convert kernels"); // Reduce with a sum via VecScatter with ADD_VALUES, SCATTER_REVERSE // Convert int → PetscScalar for the leaf (nonlocal) data { PetscScalarKokkosView leaf_scalar_d; - PetscCallVoid(VecGetKokkosViewWrite(mat_mpi->lvec, &leaf_scalar_d)); + PetscCallVoid(VecGetKokkosViewWrite(scatter_leaf_vec, &leaf_scalar_d)); Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(PetscInt i) { leaf_scalar_d(i) = (PetscScalar)cf_markers_nonlocal_d(i); }); - PetscCallVoid(VecRestoreKokkosViewWrite(mat_mpi->lvec, &leaf_scalar_d)); + PetscCallVoid(VecRestoreKokkosViewWrite(scatter_leaf_vec, &leaf_scalar_d)); } // Convert int → PetscScalar for the root (local) data { @@ -582,14 +386,12 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co } // Ensure send/receive buffers are stable before Begin. Kokkos::fence(); - fprintf(stderr, "[PFLARE pmisr cp rank=%d iter=%d] I: before rev VecScatterBegin\n", rank_cp, loops_through); fflush(stderr); - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, mat_mpi->lvec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_leaf_vec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); // Complete reverse scatter before reading reduced root buffer. - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, mat_mpi->lvec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_leaf_vec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); // While reverse scatter is in-flight, do local-only updates in cf_markers_temp_d. - PMISR_CP("J: before overlap TeamPolicy kernel"); - // This must not touch scatter_root_vec/mat_mpi->lvec. + // This must not touch scatter_root_vec/scatter_leaf_vec. Kokkos::parallel_for( Kokkos::TeamPolicy<>(exec, local_rows, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { @@ -616,7 +418,6 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co }); // Convert PetscScalar → int back to cf_markers_d after End. - PMISR_CP("K: before root->cf_markers_d convert"); { ConstPetscScalarKokkosView root_scalar_d; PetscCallVoid(VecGetKokkosView(scatter_root_vec, &root_scalar_d)); @@ -628,7 +429,6 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co } // Merge the local updates after the VecScatter reduction has completed. - PMISR_CP("L: before merge kernel"); Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, local_rows), KOKKOS_LAMBDA(PetscInt i) { @@ -689,6 +489,7 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // Cleanup loop Vecs PetscCallVoid(VecDestroy(&scatter_root_vec)); + PetscCallVoid(VecDestroy(&scatter_leaf_vec)); // ~~~~~~~~~ // Now assign our final cf markers @@ -839,8 +640,9 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength // directly to PetscSF causes intermittent failures in parallel GPU runs (exact cause unknown). if (mpi) { - Vec measure_root_vec; + Vec measure_root_vec, measure_leaf_vec; PetscCallVoid(MatCreateVecs(*strength_mat, &measure_root_vec, NULL)); + PetscCallVoid(VecDuplicate(mat_mpi->lvec, &measure_leaf_vec)); { PetscScalarKokkosView root_scalar_d; PetscCallVoid(VecGetKokkosViewWrite(measure_root_vec, &root_scalar_d)); @@ -848,16 +650,17 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength PetscCallVoid(VecRestoreKokkosViewWrite(measure_root_vec, &root_scalar_d)); } // Ensure send/receive buffers are stable before Begin. - Kokkos::fence(); - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, measure_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, measure_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); + Kokkos::fence(); + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, measure_root_vec, measure_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, measure_root_vec, measure_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); { ConstPetscScalarKokkosView lvec_scalar_d; - PetscCallVoid(VecGetKokkosView(mat_mpi->lvec, &lvec_scalar_d)); + PetscCallVoid(VecGetKokkosView(measure_leaf_vec, &lvec_scalar_d)); Kokkos::deep_copy(exec, measure_nonlocal_d, lvec_scalar_d); - PetscCallVoid(VecRestoreKokkosView(mat_mpi->lvec, &lvec_scalar_d)); + PetscCallVoid(VecRestoreKokkosView(measure_leaf_vec, &lvec_scalar_d)); } PetscCallVoid(VecDestroy(&measure_root_vec)); + PetscCallVoid(VecDestroy(&measure_leaf_vec)); } // ~~~~~~~~~~~~ @@ -921,9 +724,10 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength // ~~~~~~~~~~~~ // Create reusable Vecs for VecScatter inside the loop - Vec scatter_root_vec = NULL; + Vec scatter_root_vec = NULL, scatter_leaf_vec = NULL; if (mpi) { PetscCallVoid(MatCreateVecs(*strength_mat, &scatter_root_vec, NULL)); + PetscCallVoid(VecDuplicate(mat_mpi->lvec, &scatter_leaf_vec)); } // Let's keep track of how many times we go through the loops @@ -957,18 +761,18 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength PetscCallVoid(VecRestoreKokkosViewWrite(scatter_root_vec, &root_scalar_d)); } // Ensure send/receive buffers are stable before Begin. - Kokkos::fence(); - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); + Kokkos::fence(); + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); // Convert PetscScalar → int { ConstPetscScalarKokkosView leaf_scalar_d; - PetscCallVoid(VecGetKokkosView(mat_mpi->lvec, &leaf_scalar_d)); + PetscCallVoid(VecGetKokkosView(scatter_leaf_vec, &leaf_scalar_d)); Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(PetscInt i) { cf_markers_nonlocal_d(i) = (int)leaf_scalar_d(i); }); - PetscCallVoid(VecRestoreKokkosView(mat_mpi->lvec, &leaf_scalar_d)); + PetscCallVoid(VecRestoreKokkosView(scatter_leaf_vec, &leaf_scalar_d)); } } @@ -1090,12 +894,12 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength // (LOR is equivalent to sum when values are 0/1 bools) { PetscScalarKokkosView leaf_scalar_d; - PetscCallVoid(VecGetKokkosViewWrite(mat_mpi->lvec, &leaf_scalar_d)); + PetscCallVoid(VecGetKokkosViewWrite(scatter_leaf_vec, &leaf_scalar_d)); Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(PetscInt i) { leaf_scalar_d(i) = veto_nonlocal_d(i) ? 1.0 : 0.0; }); - PetscCallVoid(VecRestoreKokkosViewWrite(mat_mpi->lvec, &leaf_scalar_d)); + PetscCallVoid(VecRestoreKokkosViewWrite(scatter_leaf_vec, &leaf_scalar_d)); } { PetscScalarKokkosView root_scalar_d; @@ -1108,8 +912,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength } // Ensure send/receive buffers are stable before Begin. Kokkos::fence(); - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, mat_mpi->lvec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, mat_mpi->lvec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_leaf_vec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_leaf_vec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); { ConstPetscScalarKokkosView root_scalar_d; PetscCallVoid(VecGetKokkosView(scatter_root_vec, &root_scalar_d)); @@ -1225,17 +1029,17 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength PetscCallVoid(VecRestoreKokkosViewWrite(scatter_root_vec, &root_scalar_d)); } // Ensure send/receive buffers are stable before Begin. - Kokkos::fence(); - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); + Kokkos::fence(); + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); { ConstPetscScalarKokkosView leaf_scalar_d; - PetscCallVoid(VecGetKokkosView(mat_mpi->lvec, &leaf_scalar_d)); + PetscCallVoid(VecGetKokkosView(scatter_leaf_vec, &leaf_scalar_d)); Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(PetscInt i) { cf_markers_nonlocal_d(i) = (int)leaf_scalar_d(i); }); - PetscCallVoid(VecRestoreKokkosView(mat_mpi->lvec, &leaf_scalar_d)); + PetscCallVoid(VecRestoreKokkosView(scatter_leaf_vec, &leaf_scalar_d)); } // We use the veto arrays here to do this comms @@ -1270,12 +1074,12 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength // Any local node with veto set to true is not in the set { PetscScalarKokkosView leaf_scalar_d; - PetscCallVoid(VecGetKokkosViewWrite(mat_mpi->lvec, &leaf_scalar_d)); + PetscCallVoid(VecGetKokkosViewWrite(scatter_leaf_vec, &leaf_scalar_d)); Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(PetscInt i) { leaf_scalar_d(i) = veto_nonlocal_d(i) ? 1.0 : 0.0; }); - PetscCallVoid(VecRestoreKokkosViewWrite(mat_mpi->lvec, &leaf_scalar_d)); + PetscCallVoid(VecRestoreKokkosViewWrite(scatter_leaf_vec, &leaf_scalar_d)); } { PetscScalarKokkosView root_scalar_d; @@ -1288,8 +1092,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength } // Ensure send/receive buffers are stable before Begin. Kokkos::fence(); - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, mat_mpi->lvec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, mat_mpi->lvec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_leaf_vec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_leaf_vec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); { ConstPetscScalarKokkosView root_scalar_d; PetscCallVoid(VecGetKokkosView(scatter_root_vec, &root_scalar_d)); @@ -1398,6 +1202,7 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength // Cleanup loop Vecs PetscCallVoid(VecDestroy(&scatter_root_vec)); + PetscCallVoid(VecDestroy(&scatter_leaf_vec)); // Cleanup the local transposes if (destroy_spst) PetscCallVoid(MatDestroy(&mat_local_spst)); From 2156999ee77092a818aa21d9674fcd1645794bfc Mon Sep 17 00:00:00 2001 From: sdargavi Date: Sat, 4 Apr 2026 23:29:00 +0100 Subject: [PATCH 24/60] Add exec and fence to d2h copies --- src/Device_Datak.kokkos.cxx | 18 +++++++++++++----- src/PETSc_Helperk.kokkos.cxx | 15 ++++++++++----- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/src/Device_Datak.kokkos.cxx b/src/Device_Datak.kokkos.cxx index 8bbd9c4f..0e1e3b83 100644 --- a/src/Device_Datak.kokkos.cxx +++ b/src/Device_Datak.kokkos.cxx @@ -19,9 +19,12 @@ PETSC_INTERN void copy_cf_markers_d2h(int *cf_markers_local) // Host wrapper for cf_markers_local intKokkosViewHost cf_markers_local_h(cf_markers_local, cf_markers_local_d.extent(0)); + auto exec = PetscGetKokkosExecutionSpace(); + // Now copy device cf_markers_local_d back to host // Device to host so don't need to specify exec space - Kokkos::deep_copy(cf_markers_local_h, cf_markers_local_d); + Kokkos::deep_copy(exec, cf_markers_local_h, cf_markers_local_d); + Kokkos::fence(); // Log copy with petsc size_t bytes = cf_markers_local_d.extent(0) * sizeof(int); PetscCallVoid(PetscLogGpuToCpu(bytes)); @@ -38,9 +41,12 @@ PETSC_INTERN void copy_diag_dom_ratio_d2h(PetscReal *diag_dom_ratio_local) // Host wrapper for diag_dom_ratio_local PetscScalarKokkosViewHost diag_dom_ratio_h(diag_dom_ratio_local, diag_dom_ratio_local_d.extent(0)); + auto exec = PetscGetKokkosExecutionSpace(); + // Copy device diag_dom_ratio_local_d back to host // Device to host so don't need to specify exec space - Kokkos::deep_copy(diag_dom_ratio_h, diag_dom_ratio_local_d); + Kokkos::deep_copy(exec, diag_dom_ratio_h, diag_dom_ratio_local_d); + Kokkos::fence(); // Log copy with petsc size_t bytes = diag_dom_ratio_local_d.extent(0) * sizeof(PetscReal); PetscCallVoid(PetscLogGpuToCpu(bytes)); @@ -114,7 +120,8 @@ PETSC_INTERN void create_cf_is_device_kokkos(Mat *input_mat, const int match_cf, // The last entry in point_offsets_d is the total number of points that match match_cf PetscInt local_rows_row = 0; // Device to host so don't need to specify exec space - Kokkos::deep_copy(local_rows_row, Kokkos::subview(point_offsets_d, local_rows)); + Kokkos::deep_copy(exec, local_rows_row, Kokkos::subview(point_offsets_d, local_rows)); + Kokkos::fence(); // This will be equivalent to is_fine - global_row_start, ie the local indices is_local_d = PetscIntKokkosView("is_local_d", local_rows_row); @@ -183,8 +190,9 @@ PETSC_INTERN void create_cf_is_kokkos(Mat *input_mat, IS *is_fine, IS *is_coarse // Copy over the indices to the host // Device to host so don't need to specify exec space - Kokkos::deep_copy(is_fine_h, is_fine_local_d); - Kokkos::deep_copy(is_coarse_h, is_coarse_local_d); + Kokkos::deep_copy(exec, is_fine_h, is_fine_local_d); + Kokkos::deep_copy(exec, is_coarse_h, is_coarse_local_d); + Kokkos::fence(); // Log copy with petsc size_t bytes_fine = is_fine_local_d.extent(0) * sizeof(PetscInt); size_t bytes_coarse = is_coarse_local_d.extent(0) * sizeof(PetscInt); diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 7065d77b..c8772dac 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -139,7 +139,8 @@ PETSC_INTERN void rewrite_j_global_to_local(PetscInt colmap_max_size, PetscInt & { PetscIntKokkosViewHost colmap_output_h = PetscIntKokkosViewHost(*garray_host, col_ao_output); // Device to host so don't need to specify exec space - Kokkos::deep_copy(colmap_output_h, garray_d); + Kokkos::deep_copy(exec, colmap_output_h, garray_d); + Kokkos::fence(); // Log copy with petsc size_t bytes = col_ao_output * sizeof(PetscInt); PetscCallVoid(PetscLogGpuToCpu(bytes)); @@ -855,7 +856,8 @@ PETSC_INTERN void remove_small_from_sparse_kokkos(Mat *input_mat, const PetscRea { PetscIntKokkosViewHost garray_h(garray_host, col_ao_output); // Device to host so don't need to specify exec space - Kokkos::deep_copy(garray_h, garray_d); + Kokkos::deep_copy(exec, garray_h, garray_d); + Kokkos::fence(); size_t bytes = col_ao_output * sizeof(PetscInt); PetscCallVoid(PetscLogGpuToCpu(bytes)); } @@ -1920,7 +1922,8 @@ PETSC_INTERN void MatAXPY_kokkos(Mat *Y, PetscScalar alpha, Mat *X) { PetscIntKokkosViewHost garray_h(garray_host, col_ao_output); // Device to host so don't need to specify exec space - Kokkos::deep_copy(garray_h, garray_d); + Kokkos::deep_copy(exec, garray_h, garray_d); + Kokkos::fence(); size_t bytes = col_ao_output * sizeof(PetscInt); PetscCallVoid(PetscLogGpuToCpu(bytes)); } @@ -2515,7 +2518,8 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV PetscCallVoid(PetscMalloc1(garray_output_d.extent(0), &garray_host)); PetscIntKokkosViewHost colmap_output_h = PetscIntKokkosViewHost(garray_host, garray_output_d.extent(0)); // Copy the garray output to the host - Kokkos::deep_copy(colmap_output_h, garray_output_d); + Kokkos::deep_copy(exec, colmap_output_h, garray_output_d); + Kokkos::fence(); bytes = colmap_output_h.extent(0) * sizeof(PetscInt); PetscCallVoid(PetscLogGpuToCpu(bytes)); @@ -2531,7 +2535,8 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV PetscCallVoid(PetscMalloc1(is_col_o_d.extent(0), &is_col_o_host)); PetscIntKokkosViewHost is_col_o_h = PetscIntKokkosViewHost(is_col_o_host, is_col_o_d.extent(0)); // Copy the is_col_o_d output to the host - Kokkos::deep_copy(is_col_o_h, is_col_o_d); + Kokkos::deep_copy(exec, is_col_o_h, is_col_o_d); + Kokkos::fence(); bytes = is_col_o_h.extent(0) * sizeof(PetscInt); PetscCallVoid(PetscLogGpuToCpu(bytes)); // Now create an IS From 51fa9390ec9043d68a8457244febb39147fc5fa9 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Sat, 4 Apr 2026 23:46:04 +0100 Subject: [PATCH 25/60] Add more print staetments --- src/AIR_MG_Setup.F90 | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/AIR_MG_Setup.F90 b/src/AIR_MG_Setup.F90 index 244c9788..2c12f1c1 100644 --- a/src/AIR_MG_Setup.F90 +++ b/src/AIR_MG_Setup.F90 @@ -155,7 +155,9 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) ! We already know how many coarse levels we have if we are re-using if (.NOT. air_data%allocated_matrices_A_ff(our_level) .AND. & our_level .ge. air_data%options%auto_truncate_start_level .AND. & - air_data%options%auto_truncate_start_level /= -1) then + air_data%options%auto_truncate_start_level /= -1) then + + print *, "starting auto truncate check on level ", our_level call timer_start(TIMER_ID_AIR_TRUNCATE) @@ -168,6 +170,8 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) proc_stride, & air_data%inv_coarsest_poly_data) + print *, "starting approx inverse ", our_level + ! Start the approximate inverse we'll use on this level call start_approximate_inverse(air_data%coarse_matrix(our_level), & air_data%inv_coarsest_poly_data%inverse_type, & @@ -189,6 +193,8 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) call VecDuplicate(rand_vec, sol_vec, ierr) call VecDuplicate(rand_vec, temp_vec, ierr) + print *, "starting finish approx inverse ", our_level + ! Finish our approximate inverse call finish_approximate_inverse(air_data%coarse_matrix(our_level), & air_data%inv_coarsest_poly_data%inverse_type, & @@ -209,6 +215,8 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) air_data%inv_coarsest_poly_data%inverse_type == PFLAREINV_NEWTON_NO_EXTRA) .AND. & air_data%options%coarsest_matrix_free_polys) then + print *, "starting matvecs residual ", our_level + if (air_data%options%coarsest_diag_scale_polys) then call petsc_matvec_right_scale_poly_newton_residual_mf(air_data%inv_A_ff(our_level), rand_vec, temp_vec) else @@ -225,6 +233,8 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) call VecAXPY(temp_vec, -1d0, rand_vec, ierr) end if + print *, "computing norms ", our_level + ! Get the achieved norm call VecNorm(temp_vec, NORM_2, achieved_rel_tol, ierr) @@ -253,6 +263,8 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) call timer_finish(TIMER_ID_AIR_TRUNCATE) end if + print *, "starting cf splitting ", our_level + ! ~~~~~~~~~~~~ ! Compute the coarsening ! ~~~~~~~~~~~~ From b845b5b8e58e911491ff0dafac6cf42b2ac1c12b Mon Sep 17 00:00:00 2001 From: sdargavi Date: Mon, 6 Apr 2026 16:36:45 +0100 Subject: [PATCH 26/60] Add arnoldi prints --- src/Gmres_Poly.F90 | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/Gmres_Poly.F90 b/src/Gmres_Poly.F90 index 1e6e7926..011a279d 100644 --- a/src/Gmres_Poly.F90 +++ b/src/Gmres_Poly.F90 @@ -477,15 +477,21 @@ subroutine calculate_gmres_polynomial_coefficients_arnoldi(matrix, poly_order, c call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) end if + print *, "about to muller" + ! ~~~~~~~~~~ ! Allocate space and create random numbers ! The first vec has random numbers in it ! ~~~~~~~~~~ call create_temp_space_box_muller(matrix, subspace_size, V_n) + + print *, "done muller" ! Create an extra vector for storage call VecDuplicate(V_n(1), w_j, ierr) + print *, "about to arnoldi" + ! Do the Arnoldi and compute H_n and C_n ! We only compute H_n until we hit a relative residual of 1e-14 against the random rhs ! or we hit the given poly_order @@ -494,6 +500,8 @@ subroutine calculate_gmres_polynomial_coefficients_arnoldi(matrix, poly_order, c call arnoldi(matrix, poly_order, 1d-30, V_n, w_j, beta, H_n, m, C_n, y, rel_tol) if (present(user_rel_tol)) user_rel_tol = rel_tol + print *, "done arnoldi" + ! ~~~~~~~~~~~~~ ! Compute the polynomial coefficients, this is C_n(1:m, 1:m) y ! ~~~~~~~~~~~~~ From f86e5fe8e17c800b5faca22fff56bcc698aa27ba Mon Sep 17 00:00:00 2001 From: sdargavi Date: Mon, 6 Apr 2026 20:26:47 +0100 Subject: [PATCH 27/60] Ensure reads are atomic --- src/PMISR_Modulek.kokkos.cxx | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx index 481c2c77..6ce6344d 100644 --- a/src/PMISR_Modulek.kokkos.cxx +++ b/src/PMISR_Modulek.kokkos.cxx @@ -402,7 +402,7 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // Check if this node was assigned during this top loop. // We read the temporary buffer here so we do not race with the // reduction into cf_markers_d. - if (cf_markers_temp_d(i) == 2) + if (Kokkos::atomic_load(&cf_markers_temp_d(i)) == 2) { const PetscInt ncols_local = device_local_i[i + 1] - device_local_i[i]; @@ -449,7 +449,7 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // In serial there is no reduction, so we can // update cf_markers_d directly as before. - if (cf_markers_d(i) == loops_through) + if (Kokkos::atomic_load(&cf_markers_d(i)) == loops_through) { const PetscInt ncols_local = device_local_i[i + 1] - device_local_i[i]; @@ -997,7 +997,7 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength const PetscInt i = t.league_rank(); // Check if this node has been assigned during this top loop - if (cf_markers_d(i) == loops_through) + if (Kokkos::atomic_load(&cf_markers_d(i)) == loops_through) { // Do the strong dependencies and influences PetscInt ncols_local = device_local_i_spst[i + 1] - device_local_i_spst[i]; @@ -1009,7 +1009,7 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength // Skip the diagonal - we don't want to mark ourselves as a neighbor // Needs to be atomic as may being set by many threads - if (cf_markers_d(col) != 1 && col != i) + if (Kokkos::atomic_load(&cf_markers_d(col)) != 1 && col != i) { Kokkos::atomic_store(&cf_markers_d(col), 1); } @@ -1133,7 +1133,7 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength Kokkos::TeamThreadRange(t, ncols_nonlocal), [&](const PetscInt j) { // Needs to be atomic as may being set by many threads - if (cf_markers_d(device_nonlocal_j_transpose[device_nonlocal_i_transpose[i] + j]) != 1) + if (Kokkos::atomic_load(&cf_markers_d(device_nonlocal_j_transpose[device_nonlocal_i_transpose[i] + j])) != 1) { Kokkos::atomic_store(&cf_markers_d(device_nonlocal_j_transpose[device_nonlocal_i_transpose[i] + j]), 1); } @@ -1154,7 +1154,7 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength const PetscInt i = t.league_rank(); // Check if this node has been assigned during this top loop - if (cf_markers_d(i) == loops_through) + if (Kokkos::atomic_load(&cf_markers_d(i)) == loops_through) { // Do the strong dependencies and influences PetscInt ncols_local = device_local_i_spst[i + 1] - device_local_i_spst[i]; @@ -1166,7 +1166,7 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength // Skip the diagonal - we don't want to mark ourselves as a neighbor // Needs to be atomic as may being set by many threads - if (cf_markers_d(col) != 1 && col != i) + if (Kokkos::atomic_load(&cf_markers_d(col)) != 1 && col != i) { Kokkos::atomic_store(&cf_markers_d(col), 1); } From 9d6b754781dbd4f6547ebdfcf151ce474ecdaca0 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Mon, 6 Apr 2026 20:43:02 +0100 Subject: [PATCH 28/60] Move scatter --- src/PMISR_Modulek.kokkos.cxx | 52 +++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx index 6ce6344d..1d22bd97 100644 --- a/src/PMISR_Modulek.kokkos.cxx +++ b/src/PMISR_Modulek.kokkos.cxx @@ -1017,31 +1017,6 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength } }); - // Now for the influences, we need to broadcast the cf_markers so that - // on other ranks we know which nodes have cf_markers_nonlocal_d(i) == loops_through - { - PetscScalarKokkosView root_scalar_d; - PetscCallVoid(VecGetKokkosViewWrite(scatter_root_vec, &root_scalar_d)); - Kokkos::parallel_for( - Kokkos::RangePolicy<>(exec, 0, local_rows), KOKKOS_LAMBDA(PetscInt i) { - root_scalar_d(i) = (PetscScalar)cf_markers_d(i); - }); - PetscCallVoid(VecRestoreKokkosViewWrite(scatter_root_vec, &root_scalar_d)); - } - // Ensure send/receive buffers are stable before Begin. - Kokkos::fence(); - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); - { - ConstPetscScalarKokkosView leaf_scalar_d; - PetscCallVoid(VecGetKokkosView(scatter_leaf_vec, &leaf_scalar_d)); - Kokkos::parallel_for( - Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(PetscInt i) { - cf_markers_nonlocal_d(i) = (int)leaf_scalar_d(i); - }); - PetscCallVoid(VecRestoreKokkosView(scatter_leaf_vec, &leaf_scalar_d)); - } - // We use the veto arrays here to do this comms Kokkos::deep_copy(exec, veto_nonlocal_d, false); Kokkos::deep_copy(exec, veto_local_d, false); @@ -1113,6 +1088,33 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength } }); + // Now that non-local dependencies are marked, broadcast the cf_markers so that + // on other ranks we know which nodes have cf_markers_nonlocal_d(i) == loops_through + // (i.e. which nonlocal nodes were assigned to the IS this iteration). + // This matches the Fortran ordering: reverse scatter first, then forward scatter. + { + PetscScalarKokkosView root_scalar_d; + PetscCallVoid(VecGetKokkosViewWrite(scatter_root_vec, &root_scalar_d)); + Kokkos::parallel_for( + Kokkos::RangePolicy<>(exec, 0, local_rows), KOKKOS_LAMBDA(PetscInt i) { + root_scalar_d(i) = (PetscScalar)cf_markers_d(i); + }); + PetscCallVoid(VecRestoreKokkosViewWrite(scatter_root_vec, &root_scalar_d)); + } + // Ensure send/receive buffers are stable before Begin. + Kokkos::fence(); + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + { + ConstPetscScalarKokkosView leaf_scalar_d; + PetscCallVoid(VecGetKokkosView(scatter_leaf_vec, &leaf_scalar_d)); + Kokkos::parallel_for( + Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(PetscInt i) { + cf_markers_nonlocal_d(i) = (int)leaf_scalar_d(i); + }); + PetscCallVoid(VecRestoreKokkosView(scatter_leaf_vec, &leaf_scalar_d)); + } + // And now we have the information we need to set any of the non-local influences if (mat_nonlocal_transpose != NULL) { From d557cc6eceeaf4828eb45686427dac01447ca78e Mon Sep 17 00:00:00 2001 From: sdargavi Date: Mon, 6 Apr 2026 21:11:40 +0100 Subject: [PATCH 29/60] Bounds checking on cf splitting --- include/kokkos_helper.hpp | 99 ++++++++++++++++++++++++++++++++++++ src/DDC_Modulek.kokkos.cxx | 6 +++ src/Device_Datak.kokkos.cxx | 7 +++ src/PMISR_Modulek.kokkos.cxx | 3 ++ 4 files changed, 115 insertions(+) diff --git a/include/kokkos_helper.hpp b/include/kokkos_helper.hpp index d34f8b8f..9c173b40 100644 --- a/include/kokkos_helper.hpp +++ b/include/kokkos_helper.hpp @@ -164,4 +164,103 @@ PetscInt binary_search_sorted(const ViewType &sorted_view, const PetscInt size, return -1; } +// Check that every entry in cf_markers_d is either -1 (F) or 1 (C). +// Calls MPI_Abort if any local point is not marked. +inline void check_cf_markers_all_marked_kokkos( + const intKokkosView &cf_markers_d, + const PetscInt local_rows, + MPI_Comm MPI_COMM_MATRIX) +{ + auto exec = PetscGetKokkosExecutionSpace(); + PetscInt bad_count = 0; + Kokkos::parallel_reduce( + "check_cf_markers", + Kokkos::RangePolicy<>(exec, 0, local_rows), + KOKKOS_LAMBDA(const PetscInt i, PetscInt &count) { + if (cf_markers_d(i) != -1 && cf_markers_d(i) != 1) count++; + }, bad_count); + Kokkos::fence(); + int rank = 0; + MPI_Comm_rank(MPI_COMM_MATRIX, &rank); + if (bad_count > 0) { + fprintf(stderr, + "[PFLARE kokkos rank=%d] ERROR check_cf_markers_all_marked_kokkos: " + "%d / %d local points are NOT marked F or C\n", + rank, (int)bad_count, (int)local_rows); + fflush(stderr); + MPI_Abort(MPI_COMM_MATRIX, 1); + } else { + fprintf(stderr, + "[PFLARE kokkos rank=%d] check_cf_markers_all_marked_kokkos: " + "all %d local points marked F or C OK\n", + rank, (int)local_rows); + fflush(stderr); + } +} + +// Check that is_fine_local_d and is_coarse_local_d together cover every local +// point [0, local_rows-1] exactly once (no missing, no duplicates). +// Call before global-index conversion (entries are local offsets [0, local_rows-1]). +// Calls MPI_Abort if any point is missing or duplicated. +inline void check_cf_is_all_local_kokkos( + const PetscIntKokkosView &is_fine_local_d, + const PetscIntKokkosView &is_coarse_local_d, + const PetscInt local_rows, + MPI_Comm MPI_COMM_MATRIX) +{ + auto exec = PetscGetKokkosExecutionSpace(); + int rank = 0; + MPI_Comm_rank(MPI_COMM_MATRIX, &rank); + + // Allocate hit-count array, initialised to 0 + intKokkosView hit_count("hit_count", local_rows); + Kokkos::deep_copy(exec, hit_count, 0); + + // Mark each fine index (atomic to catch duplicates within the fine set) + Kokkos::parallel_for( + "check_cf_is_mark_fine", + Kokkos::RangePolicy<>(exec, 0, (PetscInt)is_fine_local_d.extent(0)), + KOKKOS_LAMBDA(const PetscInt i) { + const PetscInt idx = is_fine_local_d(i); + if (idx >= 0 && idx < local_rows) + Kokkos::atomic_add(&hit_count(idx), 1); + }); + + // Mark each coarse index + Kokkos::parallel_for( + "check_cf_is_mark_coarse", + Kokkos::RangePolicy<>(exec, 0, (PetscInt)is_coarse_local_d.extent(0)), + KOKKOS_LAMBDA(const PetscInt i) { + const PetscInt idx = is_coarse_local_d(i); + if (idx >= 0 && idx < local_rows) + Kokkos::atomic_add(&hit_count(idx), 1); + }); + + // Count any point not hit exactly once + PetscInt bad_count = 0; + Kokkos::parallel_reduce( + "check_cf_is_count_bad", + Kokkos::RangePolicy<>(exec, 0, local_rows), + KOKKOS_LAMBDA(const PetscInt i, PetscInt &count) { + if (hit_count(i) != 1) count++; + }, bad_count); + + Kokkos::fence(); + + if (bad_count > 0) { + fprintf(stderr, + "[PFLARE kokkos rank=%d] ERROR check_cf_is_all_local_kokkos: " + "%d / %d local points are not covered exactly once by fine+coarse IS\n", + rank, (int)bad_count, (int)local_rows); + fflush(stderr); + MPI_Abort(MPI_COMM_MATRIX, 1); + } else { + fprintf(stderr, + "[PFLARE kokkos rank=%d] check_cf_is_all_local_kokkos: " + "fine=%d coarse=%d, all %d local points covered exactly once OK\n", + rank, (int)is_fine_local_d.extent(0), (int)is_coarse_local_d.extent(0), (int)local_rows); + fflush(stderr); + } +} + #endif \ No newline at end of file diff --git a/src/DDC_Modulek.kokkos.cxx b/src/DDC_Modulek.kokkos.cxx index d0495156..13f782dd 100644 --- a/src/DDC_Modulek.kokkos.cxx +++ b/src/DDC_Modulek.kokkos.cxx @@ -23,6 +23,8 @@ PETSC_INTERN void ddc_kokkos(Mat *input_mat, const PetscReal fraction_swap, cons PetscIntKokkosView is_fine_local_d; // Equivalent to calling MatSeqAIJKokkosSyncDevice which is petsc intern mat_sync(input_mat); + MPI_Comm MPI_COMM_MATRIX; + PetscCallVoid(PetscObjectGetComm((PetscObject)*input_mat, &MPI_COMM_MATRIX)); const int match_cf = -1; // F_POINT == -1 create_cf_is_device_kokkos(input_mat, match_cf, is_fine_local_d); @@ -106,6 +108,8 @@ PETSC_INTERN void ddc_kokkos(Mat *input_mat, const PetscReal fraction_swap, cons // pmis_int=0 means PMISR, zero_measure_c_point_int=0 pmisr_existing_measure_implicit_transpose_kokkos(aff, -1, 0, measure_d, cf_markers_aff_d, 0); + check_cf_markers_all_marked_kokkos(cf_markers_aff_d, cf_markers_aff_d.extent(0), MPI_COMM_MATRIX); + // Swap F-tagged points back into cf_markers_d Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, local_rows_aff), KOKKOS_LAMBDA(PetscInt i) { @@ -115,6 +119,8 @@ PETSC_INTERN void ddc_kokkos(Mat *input_mat, const PetscReal fraction_swap, cons } }); Kokkos::fence(); + + check_cf_markers_all_marked_kokkos(cf_markers_d, cf_markers_d.extent(0), MPI_COMM_MATRIX); } return; } diff --git a/src/Device_Datak.kokkos.cxx b/src/Device_Datak.kokkos.cxx index 0e1e3b83..811117b6 100644 --- a/src/Device_Datak.kokkos.cxx +++ b/src/Device_Datak.kokkos.cxx @@ -151,6 +151,9 @@ PETSC_INTERN void create_cf_is_kokkos(Mat *input_mat, IS *is_fine, IS *is_coarse MPI_Comm MPI_COMM_MATRIX; PetscCallVoid(PetscObjectGetComm((PetscObject)*input_mat, &MPI_COMM_MATRIX)); + PetscInt local_rows_check, local_cols_check; + PetscCallVoid(MatGetLocalSize(*input_mat, &local_rows_check, &local_cols_check)); + // Create the local f point indices const int match_fine = -1; // F_POINT == -1 create_cf_is_device_kokkos(input_mat, match_fine, is_fine_local_d); @@ -159,6 +162,10 @@ PETSC_INTERN void create_cf_is_kokkos(Mat *input_mat, IS *is_fine, IS *is_coarse const int match_coarse = 1; // C_POINT == 1 create_cf_is_device_kokkos(input_mat, match_coarse, is_coarse_local_d); + // Sanity check: fine + coarse must cover every local point exactly once + // (check before global-index conversion while entries are still [0, local_rows-1]) + check_cf_is_all_local_kokkos(is_fine_local_d, is_coarse_local_d, local_rows_check, MPI_COMM_MATRIX); + // Now convert them back to global indices PetscInt global_row_start, global_row_end_plus_one; PetscCallVoid(MatGetOwnershipRange(*input_mat, &global_row_start, &global_row_end_plus_one)); diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx index 1d22bd97..1760546c 100644 --- a/src/PMISR_Modulek.kokkos.cxx +++ b/src/PMISR_Modulek.kokkos.cxx @@ -1326,6 +1326,9 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons // Call the existing measure cf markers function pmisr_existing_measure_cf_markers_kokkos(strength_mat, max_luby_steps, pmis_int, measure_local_d, cf_markers_d, zero_measure_c_point_int); + // Sanity check: every local point must be marked F(-1) or C(1) + check_cf_markers_all_marked_kokkos(cf_markers_d, local_rows, MPI_COMM_MATRIX); + // If PMIS then we swap the CF markers from PMISR if (pmis_int) { Kokkos::parallel_for( From 8a942f2e08b88e110b842891fbd1612c00422f42 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Tue, 7 Apr 2026 03:17:12 +0100 Subject: [PATCH 30/60] Ensure fences before vecdestroy --- src/MatDiagDomk.kokkos.cxx | 4 +++- src/PMISR_Modulek.kokkos.cxx | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/MatDiagDomk.kokkos.cxx b/src/MatDiagDomk.kokkos.cxx index b290e6ee..fd5f4247 100644 --- a/src/MatDiagDomk.kokkos.cxx +++ b/src/MatDiagDomk.kokkos.cxx @@ -174,9 +174,11 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio }); PetscCallVoid(VecRestoreKokkosView(scatter_leaf_vec, &lvec_scalar_d)); } + // Ensure the async parallel_for reading scatter_leaf_vec's device memory has completed + // before VecDestroy frees it. + Kokkos::fence(); PetscCallVoid(VecDestroy(&scatter_root_vec)); PetscCallVoid(VecDestroy(&scatter_leaf_vec)); - Kokkos::fence(); } // ~~~~~~~~~~~~~~~ diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx index 1760546c..9c57682d 100644 --- a/src/PMISR_Modulek.kokkos.cxx +++ b/src/PMISR_Modulek.kokkos.cxx @@ -99,6 +99,9 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co Kokkos::deep_copy(exec, measure_nonlocal_d, leaf_scalar_d); PetscCallVoid(VecRestoreKokkosView(measure_leaf_vec, &leaf_scalar_d)); } + // Ensure the async deep_copy reading measure_leaf_vec's device memory has completed + // before VecDestroy frees it. + Kokkos::fence(); PetscCallVoid(VecDestroy(&measure_root_vec)); PetscCallVoid(VecDestroy(&measure_leaf_vec)); } @@ -659,6 +662,9 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength Kokkos::deep_copy(exec, measure_nonlocal_d, lvec_scalar_d); PetscCallVoid(VecRestoreKokkosView(measure_leaf_vec, &lvec_scalar_d)); } + // Ensure the async deep_copy reading measure_leaf_vec's device memory has completed + // before VecDestroy frees it. + Kokkos::fence(); PetscCallVoid(VecDestroy(&measure_root_vec)); PetscCallVoid(VecDestroy(&measure_leaf_vec)); } From 8685b24412900c4374ca0a6f68937a787577ae4a Mon Sep 17 00:00:00 2001 From: sdargavi Date: Tue, 7 Apr 2026 04:54:46 +0100 Subject: [PATCH 31/60] Remove scoping --- src/PETSc_Helperk.kokkos.cxx | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index c8772dac..4e6eb576 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2364,7 +2364,7 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV PetscCallVoid(VecDuplicate(x_vec, &cmap_vec)); // Fill x_vec on device: x[is_col(i)] = is_col(i), rest = -1 - { + PetscScalarKokkosView x_scalar_d; PetscCallVoid(VecGetKokkosViewWrite(x_vec, &x_scalar_d)); Kokkos::deep_copy(exec, x_scalar_d, -1.0); @@ -2373,7 +2373,7 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV x_scalar_d(is_col_d_d(i)) = (PetscScalar)is_col_d_d(i); }); PetscCallVoid(VecRestoreKokkosViewWrite(x_vec, &x_scalar_d)); - } + /* (2) Scatter x and cmap using Mvctx to get their off-process portions */ // Keep at most one active communication on Mvctx at a time. @@ -2387,7 +2387,7 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, x_vec, x_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); // Fill cmap_vec on device: cmap[is_col(i)] = i + isstart, rest = -1 - { + PetscScalarKokkosView cmap_scalar_d; PetscCallVoid(VecGetKokkosViewWrite(cmap_vec, &cmap_scalar_d)); Kokkos::deep_copy(exec, cmap_scalar_d, -1.0); @@ -2396,7 +2396,7 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV cmap_scalar_d(is_col_d_d(i)) = (PetscScalar)(i + isstart); }); PetscCallVoid(VecRestoreKokkosViewWrite(cmap_vec, &cmap_scalar_d)); - } + Vec lcmap_vec; PetscCallVoid(VecDuplicate(mat_mpi->lvec, &lcmap_vec)); @@ -2415,8 +2415,8 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV // cmap scatter completed: lcmap_vec is now safe to read. PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, cmap_vec, lcmap_vec, INSERT_VALUES, SCATTER_FORWARD)); - if (cols_ao > 0) - { + //if (cols_ao > 0) + //{ ConstPetscScalarKokkosView lvec_scalar_d; PetscCallVoid(VecGetKokkosView(x_leaf_vec, &lvec_scalar_d)); @@ -2433,7 +2433,7 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV ); PetscCallVoid(VecRestoreKokkosView(x_leaf_vec, &lvec_scalar_d)); - } + //} // Need to do an exclusive scan on is_col_o_match_d to get the new local indices // Have to remember to go up to cols_ao+1 @@ -2453,7 +2453,7 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV garray_output_d = PetscIntKokkosView("garray_output_d", col_ao_output); // Loop over all the cols in the input matrix - { + //{ ConstPetscScalarKokkosView lcmap_scalar_d; PetscCallVoid(VecGetKokkosView(lcmap_vec, &lcmap_scalar_d)); @@ -2472,7 +2472,7 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV Kokkos::fence(); PetscCallVoid(VecRestoreKokkosView(lcmap_vec, &lcmap_scalar_d)); - } + //} // Cleanup Vecs PetscCallVoid(VecDestroy(&x_vec)); From b6ccbe02acd91f87971f3f09ba202c73ab610f5d Mon Sep 17 00:00:00 2001 From: sdargavi Date: Tue, 7 Apr 2026 13:59:50 +0100 Subject: [PATCH 32/60] More text output --- src/PETSc_Helperk.kokkos.cxx | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 4e6eb576..a70a703e 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2358,6 +2358,8 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV // Uses VecScatter with PetscScalar Vecs (matching PETSc's own pattern) // instead of direct PetscSFBcast with MPIU_INT on temporary views. + std::cerr << "one " << std::endl; + /* (1) iscol is a sub-column vector of mat, pad it with '-1.' to form a full vector x */ Vec x_vec, cmap_vec; PetscCallVoid(MatCreateVecs(*input_mat, &x_vec, NULL)); @@ -2375,6 +2377,8 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV PetscCallVoid(VecRestoreKokkosViewWrite(x_vec, &x_scalar_d)); + std::cerr << "two " << std::endl; + /* (2) Scatter x and cmap using Mvctx to get their off-process portions */ // Keep at most one active communication on Mvctx at a time. // While Begin/End is in flight, do not touch the corresponding send/recv buffers. @@ -2397,6 +2401,7 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV }); PetscCallVoid(VecRestoreKokkosViewWrite(cmap_vec, &cmap_scalar_d)); + std::cerr << "three " << std::endl; Vec lcmap_vec; PetscCallVoid(VecDuplicate(mat_mpi->lvec, &lcmap_vec)); @@ -2435,6 +2440,9 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV PetscCallVoid(VecRestoreKokkosView(x_leaf_vec, &lvec_scalar_d)); //} + std::cerr << "four " << std::endl; + + // Need to do an exclusive scan on is_col_o_match_d to get the new local indices // Have to remember to go up to cols_ao+1 Kokkos::parallel_scan(Kokkos::RangePolicy<>(exec, 0, cols_ao+1), KOKKOS_LAMBDA(const PetscInt i, PetscInt& partial_sum, const bool is_final) { @@ -2474,6 +2482,9 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV PetscCallVoid(VecRestoreKokkosView(lcmap_vec, &lcmap_scalar_d)); //} + std::cerr << "five " << std::endl; + + // Cleanup Vecs PetscCallVoid(VecDestroy(&x_vec)); PetscCallVoid(VecDestroy(&x_leaf_vec)); @@ -2513,6 +2524,8 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV // If it's our first time through we have to create our output matrix if (!reuse_int) { + std::cerr << "six " << std::endl; + // Copy the garray output to the host PetscInt *garray_host = NULL; PetscCallVoid(PetscMalloc1(garray_output_d.extent(0), &garray_host)); @@ -2522,10 +2535,15 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV Kokkos::fence(); bytes = colmap_output_h.extent(0) * sizeof(PetscInt); PetscCallVoid(PetscLogGpuToCpu(bytes)); + + std::cerr << "seven " << std::endl; + // We can now create our MPI matrix PetscCallVoid(MatCreateMPIAIJWithSeqAIJ(MPI_COMM_MATRIX, global_rows_row, global_cols_col, output_mat_local, output_mat_nonlocal, garray_host, output_mat)); + std::cerr << "eight " << std::endl; + // ~~~~~~~~~~~~~~ // If this is the first time through, we need to store the iscol_o in the output_mat // We don't store the is_row_d_d or is_col_d_d like the host version does as they're super cheap to rebuild @@ -2546,6 +2564,9 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV PetscCallVoid(PetscObjectCompose((PetscObject)(*output_mat), "iscol_o", (PetscObject)iscol_o)); // The ref counter is incremented by the compose PetscCallVoid(ISDestroy(&iscol_o)); + + std::cerr << "nine " << std::endl; + } } else From 32dffcb71a014b2ce37688754a746abd8fcb86f0 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Tue, 7 Apr 2026 14:07:47 +0100 Subject: [PATCH 33/60] Extra scatter print --- src/PETSc_Helperk.kokkos.cxx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index a70a703e..845ac766 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2386,10 +2386,14 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV PetscCallVoid(VecDuplicate(mat_mpi->lvec, &x_leaf_vec)); // Ensure send/receive buffers are stable before Begin. Kokkos::fence(); + std::cerr << "two a " << std::endl; + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, x_vec, x_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); // x scatter completed: x_leaf_vec is now safe to read. PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, x_vec, x_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + std::cerr << "two b" << std::endl; + // Fill cmap_vec on device: cmap[is_col(i)] = i + isstart, rest = -1 PetscScalarKokkosView cmap_scalar_d; From 67f48f5f6058d9ba5572c4023a04b55fc050dfc8 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Tue, 7 Apr 2026 14:20:02 +0100 Subject: [PATCH 34/60] Extract acf afc print --- src/AIR_Operators_Setup.F90 | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/AIR_Operators_Setup.F90 b/src/AIR_Operators_Setup.F90 index 25cff957..97ed5b38 100644 --- a/src/AIR_Operators_Setup.F90 +++ b/src/AIR_Operators_Setup.F90 @@ -191,7 +191,9 @@ subroutine get_submatrices_start_poly_coeff_comms(input_mat, our_level, air_data ! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ! Pull out the rest of the sub-matrices ! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - call timer_start(TIMER_ID_AIR_EXTRACT) + call timer_start(TIMER_ID_AIR_EXTRACT) + + print *, "extract afc acf start" ! Only reuse when coarse matrix structure is stable (amount>=2 stores MAT_RAP_DROP) if (air_data%allocated_matrices_A_ff(our_level) .AND. & @@ -216,6 +218,8 @@ subroutine get_submatrices_start_poly_coeff_comms(input_mat, our_level, air_data our_level = our_level, is_row_fine = .FALSE., is_col_fine = .TRUE.) end if + print *, "extract afc acf done" + call timer_finish(TIMER_ID_AIR_EXTRACT) ! ~~~~~~~~~~~~~~ From 6d4ea508271b1e09ece18f3c5a98ada962fdfb9a Mon Sep 17 00:00:00 2001 From: sdargavi Date: Tue, 7 Apr 2026 14:26:43 +0100 Subject: [PATCH 35/60] Replace acf matcreatesubmatrix --- src/AIR_Operators_Setup.F90 | 44 ++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/src/AIR_Operators_Setup.F90 b/src/AIR_Operators_Setup.F90 index 97ed5b38..efb54c95 100644 --- a/src/AIR_Operators_Setup.F90 +++ b/src/AIR_Operators_Setup.F90 @@ -195,28 +195,46 @@ subroutine get_submatrices_start_poly_coeff_comms(input_mat, our_level, air_data print *, "extract afc acf start" + ! ! Only reuse when coarse matrix structure is stable (amount>=2 stores MAT_RAP_DROP) + ! if (air_data%allocated_matrices_A_ff(our_level) .AND. & + ! air_data%options%reuse_sparsity .AND. & + ! REUSE_MAT_ACTIVE(MAT_RAP_DROP, air_data%options%reuse_amount)) then + ! call MatCreateSubMatrixWrapper(input_mat, & + ! air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_REUSE_MATRIX, & + ! air_data%A_fc(our_level), & + ! our_level = our_level, is_row_fine = .TRUE., is_col_fine = .FALSE.) + ! call MatCreateSubMatrixWrapper(input_mat, & + ! air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_REUSE_MATRIX, & + ! air_data%A_cf(our_level), & + ! our_level = our_level, is_row_fine = .FALSE., is_col_fine = .TRUE.) + ! else + ! call MatCreateSubMatrixWrapper(input_mat, & + ! air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_INITIAL_MATRIX, & + ! air_data%A_fc(our_level), & + ! our_level = our_level, is_row_fine = .TRUE., is_col_fine = .FALSE.) + ! call MatCreateSubMatrixWrapper(input_mat, & + ! air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_INITIAL_MATRIX, & + ! air_data%A_cf(our_level), & + ! our_level = our_level, is_row_fine = .FALSE., is_col_fine = .TRUE.) + ! end if ! Only reuse when coarse matrix structure is stable (amount>=2 stores MAT_RAP_DROP) if (air_data%allocated_matrices_A_ff(our_level) .AND. & air_data%options%reuse_sparsity .AND. & REUSE_MAT_ACTIVE(MAT_RAP_DROP, air_data%options%reuse_amount)) then - call MatCreateSubMatrixWrapper(input_mat, & + call MatCreateSubMatrix(input_mat, & air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_REUSE_MATRIX, & - air_data%A_fc(our_level), & - our_level = our_level, is_row_fine = .TRUE., is_col_fine = .FALSE.) - call MatCreateSubMatrixWrapper(input_mat, & + air_data%A_fc(our_level), ierr) + call MatCreateSubMatrix(input_mat, & air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_REUSE_MATRIX, & - air_data%A_cf(our_level), & - our_level = our_level, is_row_fine = .FALSE., is_col_fine = .TRUE.) + air_data%A_cf(our_level),ierr) else - call MatCreateSubMatrixWrapper(input_mat, & + call MatCreateSubMatrix(input_mat, & air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_INITIAL_MATRIX, & - air_data%A_fc(our_level), & - our_level = our_level, is_row_fine = .TRUE., is_col_fine = .FALSE.) - call MatCreateSubMatrixWrapper(input_mat, & + air_data%A_fc(our_level), ierr) + call MatCreateSubMatrix(input_mat, & air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_INITIAL_MATRIX, & - air_data%A_cf(our_level), & - our_level = our_level, is_row_fine = .FALSE., is_col_fine = .TRUE.) - end if + air_data%A_cf(our_level), ierr) + end if print *, "extract afc acf done" From cd44845841ab0d4f7a66fe2e925d69a2a8859079 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Tue, 7 Apr 2026 14:34:28 +0100 Subject: [PATCH 36/60] Disable kokkos MatCreateSubMatrix --- src/AIR_Operators_Setup.F90 | 62 ++++++++-------- src/PETSc_Helper.F90 | 142 ++++++++++++++++++------------------ 2 files changed, 102 insertions(+), 102 deletions(-) diff --git a/src/AIR_Operators_Setup.F90 b/src/AIR_Operators_Setup.F90 index efb54c95..6a724012 100644 --- a/src/AIR_Operators_Setup.F90 +++ b/src/AIR_Operators_Setup.F90 @@ -195,46 +195,46 @@ subroutine get_submatrices_start_poly_coeff_comms(input_mat, our_level, air_data print *, "extract afc acf start" - ! ! Only reuse when coarse matrix structure is stable (amount>=2 stores MAT_RAP_DROP) - ! if (air_data%allocated_matrices_A_ff(our_level) .AND. & - ! air_data%options%reuse_sparsity .AND. & - ! REUSE_MAT_ACTIVE(MAT_RAP_DROP, air_data%options%reuse_amount)) then - ! call MatCreateSubMatrixWrapper(input_mat, & - ! air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_REUSE_MATRIX, & - ! air_data%A_fc(our_level), & - ! our_level = our_level, is_row_fine = .TRUE., is_col_fine = .FALSE.) - ! call MatCreateSubMatrixWrapper(input_mat, & - ! air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_REUSE_MATRIX, & - ! air_data%A_cf(our_level), & - ! our_level = our_level, is_row_fine = .FALSE., is_col_fine = .TRUE.) - ! else - ! call MatCreateSubMatrixWrapper(input_mat, & - ! air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_INITIAL_MATRIX, & - ! air_data%A_fc(our_level), & - ! our_level = our_level, is_row_fine = .TRUE., is_col_fine = .FALSE.) - ! call MatCreateSubMatrixWrapper(input_mat, & - ! air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_INITIAL_MATRIX, & - ! air_data%A_cf(our_level), & - ! our_level = our_level, is_row_fine = .FALSE., is_col_fine = .TRUE.) - ! end if ! Only reuse when coarse matrix structure is stable (amount>=2 stores MAT_RAP_DROP) if (air_data%allocated_matrices_A_ff(our_level) .AND. & air_data%options%reuse_sparsity .AND. & REUSE_MAT_ACTIVE(MAT_RAP_DROP, air_data%options%reuse_amount)) then - call MatCreateSubMatrix(input_mat, & + call MatCreateSubMatrixWrapper(input_mat, & air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_REUSE_MATRIX, & - air_data%A_fc(our_level), ierr) - call MatCreateSubMatrix(input_mat, & + air_data%A_fc(our_level), & + our_level = our_level, is_row_fine = .TRUE., is_col_fine = .FALSE.) + call MatCreateSubMatrixWrapper(input_mat, & air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_REUSE_MATRIX, & - air_data%A_cf(our_level),ierr) + air_data%A_cf(our_level), & + our_level = our_level, is_row_fine = .FALSE., is_col_fine = .TRUE.) else - call MatCreateSubMatrix(input_mat, & + call MatCreateSubMatrixWrapper(input_mat, & air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_INITIAL_MATRIX, & - air_data%A_fc(our_level), ierr) - call MatCreateSubMatrix(input_mat, & + air_data%A_fc(our_level), & + our_level = our_level, is_row_fine = .TRUE., is_col_fine = .FALSE.) + call MatCreateSubMatrixWrapper(input_mat, & air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_INITIAL_MATRIX, & - air_data%A_cf(our_level), ierr) - end if + air_data%A_cf(our_level), & + our_level = our_level, is_row_fine = .FALSE., is_col_fine = .TRUE.) + end if + ! Only reuse when coarse matrix structure is stable (amount>=2 stores MAT_RAP_DROP) + ! if (air_data%allocated_matrices_A_ff(our_level) .AND. & + ! air_data%options%reuse_sparsity .AND. & + ! REUSE_MAT_ACTIVE(MAT_RAP_DROP, air_data%options%reuse_amount)) then + ! call MatCreateSubMatrix(input_mat, & + ! air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_REUSE_MATRIX, & + ! air_data%A_fc(our_level), ierr) + ! call MatCreateSubMatrix(input_mat, & + ! air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_REUSE_MATRIX, & + ! air_data%A_cf(our_level),ierr) + ! else + ! call MatCreateSubMatrix(input_mat, & + ! air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_INITIAL_MATRIX, & + ! air_data%A_fc(our_level), ierr) + ! call MatCreateSubMatrix(input_mat, & + ! air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_INITIAL_MATRIX, & + ! air_data%A_cf(our_level), ierr) + ! end if print *, "extract afc acf done" diff --git a/src/PETSc_Helper.F90 b/src/PETSc_Helper.F90 index fd34786f..6c6bdb01 100644 --- a/src/PETSc_Helper.F90 +++ b/src/PETSc_Helper.F90 @@ -1098,79 +1098,79 @@ subroutine MatCreateSubMatrixWrapper(input_mat, is_row, is_col, & #endif ! ~~~~~~~~~~ -#if defined(PETSC_HAVE_KOKKOS) - - call MatGetType(input_mat, mat_type, ierr) - call PetscObjectGetComm(input_mat, MPI_COMM_MATRIX, ierr) - ! Get the comm size - call MPI_Comm_size(MPI_COMM_MATRIX, comm_size, errorcode) - - ! If doing parallel Kokkos - if (mat_type == MATMPIAIJKOKKOS .OR. mat_type == MATSEQAIJKOKKOS .OR. & - mat_type == MATAIJKOKKOS) then - - ! Are we reusing - reuse_logical = reuse == MAT_REUSE_MATRIX - reuse_int = 0 - if (reuse_logical) reuse_int = 1 - - A_array = input_mat%v - B_array = output_mat%v - is_row_ptr = is_row%v - is_col_ptr = is_col%v - - our_level_int = -1 - is_row_fine_int = 0 - is_col_fine_int = 0 - - if (present(our_level)) then - our_level_int = our_level - end if - if (present(is_row_fine)) then - if (is_row_fine) is_row_fine_int = 1 - end if - if (present(is_col_fine)) then - if (is_col_fine) is_col_fine_int = 1 - end if - - call MatCreateSubMatrix_kokkos(A_array, is_row_ptr, is_col_ptr, & - reuse_int, B_array, & - our_level_int, is_row_fine_int, is_col_fine_int) - - output_mat%v = B_array - - ! If debugging do a comparison between CPU and Kokkos results - if (kokkos_debug()) then - - call MatCreateSubMatrix(input_mat, is_row, is_col, & - MAT_INITIAL_MATRIX, temp_mat, ierr) - - call MatAXPY(temp_mat, -1d0, output_mat, DIFFERENT_NONZERO_PATTERN, ierr) - ! Find the biggest entry in the difference - call MatCreateVecs(temp_mat, PETSC_NULL_VEC, max_vec, ierr) - call MatGetRowMaxAbs(temp_mat, max_vec, PETSC_NULL_INTEGER_POINTER, ierr) - call VecMax(max_vec, row_loc, normy, ierr) - call VecDestroy(max_vec, ierr) - - if (normy .gt. 1d-12 .OR. normy/=normy) then - !call MatFilter(temp_mat, 1d-14, PETSC_TRUE, PETSC_FALSE, ierr) - !call MatView(temp_mat, PETSC_VIEWER_STDOUT_WORLD, ierr) - print *, "Diff Kokkos and CPU MatCreateSubMatrix", normy, "row", row_loc - call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) - end if - call MatDestroy(temp_mat, ierr) - end if - - else - - call MatCreateSubMatrix(input_mat, is_row, is_col, & - reuse, output_mat, ierr) - - end if -#else +! #if defined(PETSC_HAVE_KOKKOS) + +! call MatGetType(input_mat, mat_type, ierr) +! call PetscObjectGetComm(input_mat, MPI_COMM_MATRIX, ierr) +! ! Get the comm size +! call MPI_Comm_size(MPI_COMM_MATRIX, comm_size, errorcode) + +! ! If doing parallel Kokkos +! if (mat_type == MATMPIAIJKOKKOS .OR. mat_type == MATSEQAIJKOKKOS .OR. & +! mat_type == MATAIJKOKKOS) then + +! ! Are we reusing +! reuse_logical = reuse == MAT_REUSE_MATRIX +! reuse_int = 0 +! if (reuse_logical) reuse_int = 1 + +! A_array = input_mat%v +! B_array = output_mat%v +! is_row_ptr = is_row%v +! is_col_ptr = is_col%v + +! our_level_int = -1 +! is_row_fine_int = 0 +! is_col_fine_int = 0 + +! if (present(our_level)) then +! our_level_int = our_level +! end if +! if (present(is_row_fine)) then +! if (is_row_fine) is_row_fine_int = 1 +! end if +! if (present(is_col_fine)) then +! if (is_col_fine) is_col_fine_int = 1 +! end if + +! call MatCreateSubMatrix_kokkos(A_array, is_row_ptr, is_col_ptr, & +! reuse_int, B_array, & +! our_level_int, is_row_fine_int, is_col_fine_int) + +! output_mat%v = B_array + +! ! If debugging do a comparison between CPU and Kokkos results +! if (kokkos_debug()) then + +! call MatCreateSubMatrix(input_mat, is_row, is_col, & +! MAT_INITIAL_MATRIX, temp_mat, ierr) + +! call MatAXPY(temp_mat, -1d0, output_mat, DIFFERENT_NONZERO_PATTERN, ierr) +! ! Find the biggest entry in the difference +! call MatCreateVecs(temp_mat, PETSC_NULL_VEC, max_vec, ierr) +! call MatGetRowMaxAbs(temp_mat, max_vec, PETSC_NULL_INTEGER_POINTER, ierr) +! call VecMax(max_vec, row_loc, normy, ierr) +! call VecDestroy(max_vec, ierr) + +! if (normy .gt. 1d-12 .OR. normy/=normy) then +! !call MatFilter(temp_mat, 1d-14, PETSC_TRUE, PETSC_FALSE, ierr) +! !call MatView(temp_mat, PETSC_VIEWER_STDOUT_WORLD, ierr) +! print *, "Diff Kokkos and CPU MatCreateSubMatrix", normy, "row", row_loc +! call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) +! end if +! call MatDestroy(temp_mat, ierr) +! end if + +! else + +! call MatCreateSubMatrix(input_mat, is_row, is_col, & +! reuse, output_mat, ierr) + +! end if +! #else call MatCreateSubMatrix(input_mat, is_row, is_col, & reuse, output_mat, ierr) -#endif +!#endif end subroutine MatCreateSubMatrixWrapper From 21dbc4613a6db1f270659031158ce5212ba5b656 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Tue, 7 Apr 2026 14:50:42 +0100 Subject: [PATCH 37/60] Don't use device copies in kokkos submatrix --- src/AIR_Operators_Setup.F90 | 12 +-- src/PETSc_Helper.F90 | 142 ++++++++++++++++++------------------ 2 files changed, 75 insertions(+), 79 deletions(-) diff --git a/src/AIR_Operators_Setup.F90 b/src/AIR_Operators_Setup.F90 index 6a724012..71c552b4 100644 --- a/src/AIR_Operators_Setup.F90 +++ b/src/AIR_Operators_Setup.F90 @@ -201,21 +201,17 @@ subroutine get_submatrices_start_poly_coeff_comms(input_mat, our_level, air_data REUSE_MAT_ACTIVE(MAT_RAP_DROP, air_data%options%reuse_amount)) then call MatCreateSubMatrixWrapper(input_mat, & air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_REUSE_MATRIX, & - air_data%A_fc(our_level), & - our_level = our_level, is_row_fine = .TRUE., is_col_fine = .FALSE.) + air_data%A_fc(our_level)) call MatCreateSubMatrixWrapper(input_mat, & air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_REUSE_MATRIX, & - air_data%A_cf(our_level), & - our_level = our_level, is_row_fine = .FALSE., is_col_fine = .TRUE.) + air_data%A_cf(our_level)) else call MatCreateSubMatrixWrapper(input_mat, & air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_INITIAL_MATRIX, & - air_data%A_fc(our_level), & - our_level = our_level, is_row_fine = .TRUE., is_col_fine = .FALSE.) + air_data%A_fc(our_level)) call MatCreateSubMatrixWrapper(input_mat, & air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_INITIAL_MATRIX, & - air_data%A_cf(our_level), & - our_level = our_level, is_row_fine = .FALSE., is_col_fine = .TRUE.) + air_data%A_cf(our_level)) end if ! Only reuse when coarse matrix structure is stable (amount>=2 stores MAT_RAP_DROP) ! if (air_data%allocated_matrices_A_ff(our_level) .AND. & diff --git a/src/PETSc_Helper.F90 b/src/PETSc_Helper.F90 index 6c6bdb01..fd34786f 100644 --- a/src/PETSc_Helper.F90 +++ b/src/PETSc_Helper.F90 @@ -1098,79 +1098,79 @@ subroutine MatCreateSubMatrixWrapper(input_mat, is_row, is_col, & #endif ! ~~~~~~~~~~ -! #if defined(PETSC_HAVE_KOKKOS) - -! call MatGetType(input_mat, mat_type, ierr) -! call PetscObjectGetComm(input_mat, MPI_COMM_MATRIX, ierr) -! ! Get the comm size -! call MPI_Comm_size(MPI_COMM_MATRIX, comm_size, errorcode) - -! ! If doing parallel Kokkos -! if (mat_type == MATMPIAIJKOKKOS .OR. mat_type == MATSEQAIJKOKKOS .OR. & -! mat_type == MATAIJKOKKOS) then - -! ! Are we reusing -! reuse_logical = reuse == MAT_REUSE_MATRIX -! reuse_int = 0 -! if (reuse_logical) reuse_int = 1 - -! A_array = input_mat%v -! B_array = output_mat%v -! is_row_ptr = is_row%v -! is_col_ptr = is_col%v - -! our_level_int = -1 -! is_row_fine_int = 0 -! is_col_fine_int = 0 - -! if (present(our_level)) then -! our_level_int = our_level -! end if -! if (present(is_row_fine)) then -! if (is_row_fine) is_row_fine_int = 1 -! end if -! if (present(is_col_fine)) then -! if (is_col_fine) is_col_fine_int = 1 -! end if - -! call MatCreateSubMatrix_kokkos(A_array, is_row_ptr, is_col_ptr, & -! reuse_int, B_array, & -! our_level_int, is_row_fine_int, is_col_fine_int) - -! output_mat%v = B_array - -! ! If debugging do a comparison between CPU and Kokkos results -! if (kokkos_debug()) then - -! call MatCreateSubMatrix(input_mat, is_row, is_col, & -! MAT_INITIAL_MATRIX, temp_mat, ierr) - -! call MatAXPY(temp_mat, -1d0, output_mat, DIFFERENT_NONZERO_PATTERN, ierr) -! ! Find the biggest entry in the difference -! call MatCreateVecs(temp_mat, PETSC_NULL_VEC, max_vec, ierr) -! call MatGetRowMaxAbs(temp_mat, max_vec, PETSC_NULL_INTEGER_POINTER, ierr) -! call VecMax(max_vec, row_loc, normy, ierr) -! call VecDestroy(max_vec, ierr) - -! if (normy .gt. 1d-12 .OR. normy/=normy) then -! !call MatFilter(temp_mat, 1d-14, PETSC_TRUE, PETSC_FALSE, ierr) -! !call MatView(temp_mat, PETSC_VIEWER_STDOUT_WORLD, ierr) -! print *, "Diff Kokkos and CPU MatCreateSubMatrix", normy, "row", row_loc -! call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) -! end if -! call MatDestroy(temp_mat, ierr) -! end if - -! else - -! call MatCreateSubMatrix(input_mat, is_row, is_col, & -! reuse, output_mat, ierr) - -! end if -! #else +#if defined(PETSC_HAVE_KOKKOS) + + call MatGetType(input_mat, mat_type, ierr) + call PetscObjectGetComm(input_mat, MPI_COMM_MATRIX, ierr) + ! Get the comm size + call MPI_Comm_size(MPI_COMM_MATRIX, comm_size, errorcode) + + ! If doing parallel Kokkos + if (mat_type == MATMPIAIJKOKKOS .OR. mat_type == MATSEQAIJKOKKOS .OR. & + mat_type == MATAIJKOKKOS) then + + ! Are we reusing + reuse_logical = reuse == MAT_REUSE_MATRIX + reuse_int = 0 + if (reuse_logical) reuse_int = 1 + + A_array = input_mat%v + B_array = output_mat%v + is_row_ptr = is_row%v + is_col_ptr = is_col%v + + our_level_int = -1 + is_row_fine_int = 0 + is_col_fine_int = 0 + + if (present(our_level)) then + our_level_int = our_level + end if + if (present(is_row_fine)) then + if (is_row_fine) is_row_fine_int = 1 + end if + if (present(is_col_fine)) then + if (is_col_fine) is_col_fine_int = 1 + end if + + call MatCreateSubMatrix_kokkos(A_array, is_row_ptr, is_col_ptr, & + reuse_int, B_array, & + our_level_int, is_row_fine_int, is_col_fine_int) + + output_mat%v = B_array + + ! If debugging do a comparison between CPU and Kokkos results + if (kokkos_debug()) then + + call MatCreateSubMatrix(input_mat, is_row, is_col, & + MAT_INITIAL_MATRIX, temp_mat, ierr) + + call MatAXPY(temp_mat, -1d0, output_mat, DIFFERENT_NONZERO_PATTERN, ierr) + ! Find the biggest entry in the difference + call MatCreateVecs(temp_mat, PETSC_NULL_VEC, max_vec, ierr) + call MatGetRowMaxAbs(temp_mat, max_vec, PETSC_NULL_INTEGER_POINTER, ierr) + call VecMax(max_vec, row_loc, normy, ierr) + call VecDestroy(max_vec, ierr) + + if (normy .gt. 1d-12 .OR. normy/=normy) then + !call MatFilter(temp_mat, 1d-14, PETSC_TRUE, PETSC_FALSE, ierr) + !call MatView(temp_mat, PETSC_VIEWER_STDOUT_WORLD, ierr) + print *, "Diff Kokkos and CPU MatCreateSubMatrix", normy, "row", row_loc + call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) + end if + call MatDestroy(temp_mat, ierr) + end if + + else + + call MatCreateSubMatrix(input_mat, is_row, is_col, & + reuse, output_mat, ierr) + + end if +#else call MatCreateSubMatrix(input_mat, is_row, is_col, & reuse, output_mat, ierr) -!#endif +#endif end subroutine MatCreateSubMatrixWrapper From c3c4da7ab8e0b3e58081649a892cc1621902da8f Mon Sep 17 00:00:00 2001 From: sdargavi Date: Wed, 8 Apr 2026 02:15:02 +0100 Subject: [PATCH 38/60] Add more fences around restoreindices --- src/Grid_Transferk.kokkos.cxx | 1 + src/PETSc_Helperk.kokkos.cxx | 4 ++++ src/VecISCopyLocalk.kokkos.cxx | 6 ++++++ 3 files changed, 11 insertions(+) diff --git a/src/Grid_Transferk.kokkos.cxx b/src/Grid_Transferk.kokkos.cxx index 1fa6ac96..529e6fd8 100644 --- a/src/Grid_Transferk.kokkos.cxx +++ b/src/Grid_Transferk.kokkos.cxx @@ -366,6 +366,7 @@ PETSC_INTERN void compute_P_from_W_kokkos(Mat *input_mat, PetscInt global_row_st PetscCallVoid(PetscLogCpuToGpu(bytes)); bytes = coarse_view_h.extent(0) * sizeof(PetscInt); PetscCallVoid(PetscLogCpuToGpu(bytes)); + Kokkos::fence(); local_cols_coarse = local_rows_coarse; local_cols = local_rows_coarse + local_rows_fine; diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 845ac766..4c374b65 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2517,6 +2517,7 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV // Log copy with petsc bytes = iscol_o_view_h.extent(0) * sizeof(PetscInt); PetscCallVoid(PetscLogCpuToGpu(bytes)); + Kokkos::fence(); PetscCallVoid(ISRestoreIndices(iscol_o, &iscol_o_indices_ptr)); } @@ -2629,6 +2630,9 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos(Mat *input_mat, IS *is_row, IS *is_c // Copy indices to the device Kokkos::deep_copy(exec, is_row_d_d, is_row_view_h); Kokkos::deep_copy(exec, is_col_d_d, is_col_view_h); + // The source pointers come from ISGetIndices; ensure async copies complete + // before restoring those host buffers. + Kokkos::fence(); // Log copy with petsc size_t bytes = is_row_view_h.extent(0) * sizeof(PetscInt); PetscCallVoid(PetscLogCpuToGpu(bytes)); diff --git a/src/VecISCopyLocalk.kokkos.cxx b/src/VecISCopyLocalk.kokkos.cxx index 2cd37d96..31652ec7 100644 --- a/src/VecISCopyLocalk.kokkos.cxx +++ b/src/VecISCopyLocalk.kokkos.cxx @@ -86,6 +86,9 @@ PETSC_INTERN void set_VecISCopyLocal_kokkos_our_level(int our_level, PetscInt gl IS_fine_views_local[level_idx] = std::make_shared("IS_fine_view_" + std::to_string(our_level), fine_local_size); // Copy the indices over to the device Kokkos::deep_copy(exec, *IS_fine_views_local[level_idx], fine_view_h); + // The source pointer is owned by ISGetIndices; make sure copy completed + // before restoring that host buffer. + Kokkos::fence(); // Log copy with petsc size_t bytes = fine_view_h.extent(0) * sizeof(PetscInt); PetscCallVoid(PetscLogCpuToGpu(bytes)); @@ -105,6 +108,9 @@ PETSC_INTERN void set_VecISCopyLocal_kokkos_our_level(int our_level, PetscInt gl IS_coarse_views_local[level_idx] = std::make_shared("IS_coarse_view_" + std::to_string(our_level), coarse_local_size); // Copy the indices over to the device Kokkos::deep_copy(exec, *IS_coarse_views_local[level_idx], coarse_view_h); + // The source pointer is owned by ISGetIndices; make sure copy completed + // before restoring that host buffer. + Kokkos::fence(); // Log copy with petsc bytes = coarse_view_h.extent(0) * sizeof(PetscInt); PetscCallVoid(PetscLogCpuToGpu(bytes)); From 6f07ad6cc5f5fe6df535d4f9505fbb4805b3f5f1 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Wed, 8 Apr 2026 04:03:09 +0100 Subject: [PATCH 39/60] Hypothesis testing --- src/PETSc_Helperk.kokkos.cxx | 76 +++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 4c374b65..3a6c8665 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -1970,7 +1970,57 @@ PETSC_INTERN void MatCreateSubMatrix_Seq_kokkos(Mat *input_mat, PetscIntKokkosVi PetscCallVoid(MatGetLocalSize(*input_mat, &local_rows, &local_cols)); PetscInt local_rows_row = is_row_d_d.extent(0), local_cols_col = is_col_d_d.extent(0); - + + // ~~~~~~~~~~~~ + // DIAGNOSTIC (Step 1 of plan): verify is_row_d_d / is_col_d_d are in-bounds. + // If a caller supplies out-of-range indices, smap_d / device_local_i accesses + // below would silently clobber adjacent device allocations. + // ~~~~~~~~~~~~ + { + PetscInt row_min = 0, row_max = -1, col_min = 0, col_max = -1; + if (local_rows_row > 0) { + Kokkos::parallel_reduce("PFLARE_DBG_is_row_minmax", + Kokkos::RangePolicy<>(exec, 0, local_rows_row), + KOKKOS_LAMBDA(const PetscInt i, PetscInt &lmin) { + const PetscInt v = is_row_d_d(i); + if (v < lmin) lmin = v; + }, Kokkos::Min(row_min)); + Kokkos::parallel_reduce("PFLARE_DBG_is_row_max", + Kokkos::RangePolicy<>(exec, 0, local_rows_row), + KOKKOS_LAMBDA(const PetscInt i, PetscInt &lmax) { + const PetscInt v = is_row_d_d(i); + if (v > lmax) lmax = v; + }, Kokkos::Max(row_max)); + } + if (local_cols_col > 0) { + Kokkos::parallel_reduce("PFLARE_DBG_is_col_minmax", + Kokkos::RangePolicy<>(exec, 0, local_cols_col), + KOKKOS_LAMBDA(const PetscInt i, PetscInt &lmin) { + const PetscInt v = is_col_d_d(i); + if (v < lmin) lmin = v; + }, Kokkos::Min(col_min)); + Kokkos::parallel_reduce("PFLARE_DBG_is_col_max", + Kokkos::RangePolicy<>(exec, 0, local_cols_col), + KOKKOS_LAMBDA(const PetscInt i, PetscInt &lmax) { + const PetscInt v = is_col_d_d(i); + if (v > lmax) lmax = v; + }, Kokkos::Max(col_max)); + } + Kokkos::fence(); + if (local_rows_row > 0) { + PetscCheckAbort(row_min >= 0 && row_max < local_rows, PETSC_COMM_SELF, + PETSC_ERR_ARG_OUTOFRANGE, + "MatCreateSubMatrix_Seq_kokkos: is_row out of range [0,%" PetscInt_FMT ") got [%" PetscInt_FMT ",%" PetscInt_FMT "]", + local_rows, row_min, row_max); + } + if (local_cols_col > 0) { + PetscCheckAbort(col_min >= 0 && col_max < local_cols, PETSC_COMM_SELF, + PETSC_ERR_ARG_OUTOFRANGE, + "MatCreateSubMatrix_Seq_kokkos: is_col out of range [0,%" PetscInt_FMT ") got [%" PetscInt_FMT ",%" PetscInt_FMT "]", + local_cols, col_min, col_max); + } + } + // ~~~~~~~~~~~~ // Get pointers to the i,j,vals on the device // ~~~~~~~~~~~~ @@ -2457,6 +2507,30 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV partial_sum += input_value; // Update running total }); + // ~~~~~~~~~~~~ + // DIAGNOSTIC (Step 1 of plan): the parallel_reduce above produced + // col_ao_output on the host while the scan produced the per-index + // prefix sum on device. They must agree on the total count; if they + // don't, the size of is_col_o_d / garray_output_d below is wrong and + // the subsequent scatter kernel will write out of bounds. + // ~~~~~~~~~~~~ + { + PetscInt scan_total_h = 0; + auto tail_sv = Kokkos::subview(is_col_o_match_d, cols_ao); + Kokkos::View tail_h("PFLARE_DBG_scan_tail"); + Kokkos::deep_copy(exec, tail_h, tail_sv); + Kokkos::fence(); + scan_total_h = tail_h(); + PetscCheckAbort(scan_total_h == col_ao_output, MPI_COMM_MATRIX, + PETSC_ERR_PLIB, + "MatCreateSubMatrix_kokkos_view: parallel_reduce count (%" PetscInt_FMT ") disagrees with scan total (%" PetscInt_FMT "), cols_ao=%" PetscInt_FMT, + col_ao_output, scan_total_h, cols_ao); + PetscCheckAbort(col_ao_output >= 0 && col_ao_output <= cols_ao, MPI_COMM_MATRIX, + PETSC_ERR_PLIB, + "MatCreateSubMatrix_kokkos_view: col_ao_output=%" PetscInt_FMT " outside [0,%" PetscInt_FMT "]", + col_ao_output, cols_ao); + } + // Local indices into input garray of the columns we want to keep // but remember this doesn't mean garray_output = garray_input(is_col_o_d) // as the of columns we have in the output has changed, ie we need From fd33a7a2f0f04f48244b58484689d0df44cbb4ef Mon Sep 17 00:00:00 2001 From: sdargavi Date: Wed, 8 Apr 2026 04:16:36 +0100 Subject: [PATCH 40/60] More bounds checking --- src/PETSc_Helperk.kokkos.cxx | 56 ++++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 3a6c8665..17ac011b 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2152,11 +2152,61 @@ PETSC_INTERN void MatCreateSubMatrix_Seq_kokkos(Mat *input_mat, PetscIntKokkosVi // Create i indices // ~~~~~~~~~~~~~~~ Kokkos::parallel_for( - Kokkos::RangePolicy<>(exec, 0, local_rows_row), KOKKOS_LAMBDA(PetscInt i_idx_is_row) { + Kokkos::RangePolicy<>(exec, 0, local_rows_row), KOKKOS_LAMBDA(PetscInt i_idx_is_row) { // The start of our row index comes from the scan - i_local_d(i_idx_is_row + 1) = nnz_match_local_row_d(i_idx_is_row); - }); + i_local_d(i_idx_is_row + 1) = nnz_match_local_row_d(i_idx_is_row); + }); + + // ~~~~~~~~~~~~ + // DIAGNOSTIC (Step 1b of plan): verify i_local_d's final value equals + // nnzs_match_local, and that device_local_j entries for the rows we touch + // are all inside [0, local_cols). Either inconsistency would cause the + // team kernel below to write j_local_d / a_local_d outside their bounds. + // ~~~~~~~~~~~~ + if (local_rows_row > 0) { + PetscInt i_local_last_h = 0; + auto i_local_tail = Kokkos::subview(i_local_d, local_rows_row); + Kokkos::View i_local_tail_h("PFLARE_DBG_i_local_tail"); + Kokkos::deep_copy(exec, i_local_tail_h, i_local_tail); + Kokkos::fence(); + i_local_last_h = i_local_tail_h(); + PetscCheckAbort(i_local_last_h == nnzs_match_local, PETSC_COMM_SELF, + PETSC_ERR_PLIB, + "MatCreateSubMatrix_Seq_kokkos: i_local_d tail (%" PetscInt_FMT ") != nnzs_match_local (%" PetscInt_FMT "), local_rows_row=%" PetscInt_FMT, + i_local_last_h, nnzs_match_local, local_rows_row); + + PetscInt jmin = 0, jmax = -1; + Kokkos::parallel_reduce("PFLARE_DBG_dev_j_min", + Kokkos::RangePolicy<>(exec, 0, local_rows_row), + KOKKOS_LAMBDA(const PetscInt ir, PetscInt &lmin) { + const PetscInt i = is_row_d_d(ir); + const PetscInt s = device_local_i[i]; + const PetscInt e = device_local_i[i + 1]; + for (PetscInt k = s; k < e; ++k) { + const PetscInt v = device_local_j[k]; + if (v < lmin) lmin = v; + } + }, Kokkos::Min(jmin)); + Kokkos::parallel_reduce("PFLARE_DBG_dev_j_max", + Kokkos::RangePolicy<>(exec, 0, local_rows_row), + KOKKOS_LAMBDA(const PetscInt ir, PetscInt &lmax) { + const PetscInt i = is_row_d_d(ir); + const PetscInt s = device_local_i[i]; + const PetscInt e = device_local_i[i + 1]; + for (PetscInt k = s; k < e; ++k) { + const PetscInt v = device_local_j[k]; + if (v > lmax) lmax = v; + } + }, Kokkos::Max(jmax)); + Kokkos::fence(); + if (jmax >= 0) { + PetscCheckAbort(jmin >= 0 && jmax < local_cols, PETSC_COMM_SELF, + PETSC_ERR_PLIB, + "MatCreateSubMatrix_Seq_kokkos: device_local_j out of [0,%" PetscInt_FMT ") got [%" PetscInt_FMT ",%" PetscInt_FMT "]", + local_cols, jmin, jmax); + } + } // Execute with scratch memory Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { From 76ed5c3341d134c4ba71657b4e70a3215e267dde Mon Sep 17 00:00:00 2001 From: sdargavi Date: Wed, 8 Apr 2026 19:45:42 +0100 Subject: [PATCH 41/60] Disable local seq create matrix in kokkos --- src/PETSc_Helperk.kokkos.cxx | 84 +++++++++++++++++++++++++++++++++++- 1 file changed, 82 insertions(+), 2 deletions(-) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 17ac011b..ef41423e 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2267,10 +2267,37 @@ PETSC_INTERN void MatCreateSubMatrix_Seq_kokkos(Mat *input_mat, PetscIntKokkosVi { // Be careful to use the correct i_idx_is_row index into i_local_d here j_local_d(i_local_d(i_idx_is_row) + scratch_indices(j)) = smap_d(device_local_j[device_local_i[i] + j]) - 1; - a_local_d(i_local_d(i_idx_is_row) + scratch_indices(j)) = device_local_vals[device_local_i[i] + j]; + a_local_d(i_local_d(i_idx_is_row) + scratch_indices(j)) = device_local_vals[device_local_i[i] + j]; } }); - }); + }); + + // ~~~~~~~~~~~~ + // DIAGNOSTIC (Step 1c of plan): post-team-kernel sanity check on the + // produced j_local_d. Every column index handed to PETSc must be in + // [0, local_cols_col); a value outside that range would either be a + // smap_d corruption or a per-row scan / write-offset bug. + // ~~~~~~~~~~~~ + if (nnzs_match_local > 0) { + PetscInt jout_min = 0, jout_max = -1; + Kokkos::parallel_reduce("PFLARE_DBG_jlocal_min", + Kokkos::RangePolicy<>(exec, 0, nnzs_match_local), + KOKKOS_LAMBDA(const PetscInt k, PetscInt &lmin) { + const PetscInt v = j_local_d(k); + if (v < lmin) lmin = v; + }, Kokkos::Min(jout_min)); + Kokkos::parallel_reduce("PFLARE_DBG_jlocal_max", + Kokkos::RangePolicy<>(exec, 0, nnzs_match_local), + KOKKOS_LAMBDA(const PetscInt k, PetscInt &lmax) { + const PetscInt v = j_local_d(k); + if (v > lmax) lmax = v; + }, Kokkos::Max(jout_max)); + Kokkos::fence(); + PetscCheckAbort(jout_min >= 0 && jout_max < local_cols_col, PETSC_COMM_SELF, + PETSC_ERR_PLIB, + "MatCreateSubMatrix_Seq_kokkos: j_local_d out of [0,%" PetscInt_FMT ") got [%" PetscInt_FMT ",%" PetscInt_FMT "], nnzs=%" PetscInt_FMT, + local_cols_col, jout_min, jout_max, nnzs_match_local); + } } // If we're reusing, we can just write directly to the existing views else @@ -2430,8 +2457,61 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV } size_t bytes = 0; +// Ablation toggle (Step 2 of plan): when defined non-zero, the diagonal +// MatCreateSubMatrix_Seq_kokkos call is replaced by PETSc's host-side +// MatCreateSubMatrix on mat_local plus a MatConvert back to MATSEQAIJKOKKOS. +// Used to test whether the intermittent GPU crash originates inside the +// diag Seq_kokkos kernel chain. Reuse path is unchanged (crashes are +// first-call only). Toggle off (set to 0) to restore the original path. +#ifndef PFLARE_ABLATE_DIAG_SUBMAT +#define PFLARE_ABLATE_DIAG_SUBMAT 1 +#endif + // The diagonal component +#if PFLARE_ABLATE_DIAG_SUBMAT + if (!reuse_int) + { + // Pull the (already-local) is_row / is_col indices back to the host so + // PETSc's CPU MatCreateSubMatrix can consume them. mat_local is a + // SeqAIJKokkos but PETSc's MatCreateSubMatrix dispatches to the host + // SeqAIJ implementation, producing a SeqAIJ result that we then convert + // back to SeqAIJKokkos for the downstream MatCreateMPIAIJWithSeqAIJ. + const PetscInt n_row_h = is_row_d_d.extent(0); + const PetscInt n_col_h = is_col_d_d.extent(0); + PetscInt *is_row_host_arr = NULL, *is_col_host_arr = NULL; + PetscCallVoid(PetscMalloc1(n_row_h > 0 ? n_row_h : 1, &is_row_host_arr)); + PetscCallVoid(PetscMalloc1(n_col_h > 0 ? n_col_h : 1, &is_col_host_arr)); + PetscIntKokkosViewHost is_row_h_view(is_row_host_arr, n_row_h); + PetscIntKokkosViewHost is_col_h_view(is_col_host_arr, n_col_h); + Kokkos::deep_copy(exec, is_row_h_view, is_row_d_d); + Kokkos::deep_copy(exec, is_col_h_view, is_col_d_d); + Kokkos::fence(); + + IS is_row_temp = NULL, is_col_temp = NULL; + PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, n_row_h, is_row_host_arr, PETSC_COPY_VALUES, &is_row_temp)); + PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, n_col_h, is_col_host_arr, PETSC_COPY_VALUES, &is_col_temp)); + + Mat tmp_host_mat = NULL; + PetscCallVoid(MatCreateSubMatrix(mat_local, is_row_temp, is_col_temp, MAT_INITIAL_MATRIX, &output_mat_local)); + // Convert the SeqAIJ host result to SeqAIJKokkos so the downstream + // MatCreateMPIAIJWithSeqAIJ + reuse storage hand-off still get a Kokkos + // seq block (matches what MatCreateSubMatrix_Seq_kokkos would have + // produced). + //PetscCallVoid(MatConvert(tmp_host_mat, MATSEQAIJKOKKOS, MAT_INITIAL_MATRIX, &output_mat_local)); + + //PetscCallVoid(MatDestroy(&tmp_host_mat)); + PetscCallVoid(ISDestroy(&is_row_temp)); + PetscCallVoid(ISDestroy(&is_col_temp)); + PetscCallVoid(PetscFree(is_row_host_arr)); + PetscCallVoid(PetscFree(is_col_host_arr)); + } + else + { + MatCreateSubMatrix_Seq_kokkos(&mat_local, is_row_d_d, is_col_d_d, reuse_int, &output_mat_local); + } +#else MatCreateSubMatrix_Seq_kokkos(&mat_local, is_row_d_d, is_col_d_d, reuse_int, &output_mat_local); +#endif // The off-diagonal component requires some comms // Basically a copy of MatCreateSubMatrix_MPIAIJ_SameRowColDist From 944d18ba1df5b3609c78ca4ee3706e6fd16288c0 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Wed, 8 Apr 2026 20:04:10 +0100 Subject: [PATCH 42/60] Replacement --- src/PETSc_Helperk.kokkos.cxx | 53 +++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index ef41423e..df2e008b 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2464,7 +2464,7 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV // diag Seq_kokkos kernel chain. Reuse path is unchanged (crashes are // first-call only). Toggle off (set to 0) to restore the original path. #ifndef PFLARE_ABLATE_DIAG_SUBMAT -#define PFLARE_ABLATE_DIAG_SUBMAT 1 +#define PFLARE_ABLATE_DIAG_SUBMAT 0 #endif // The diagonal component @@ -2515,8 +2515,59 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV // The off-diagonal component requires some comms // Basically a copy of MatCreateSubMatrix_MPIAIJ_SameRowColDist + +// Off-diagonal ablation toggle (step 2a of plan): when non-zero, the entire +// off-diag VecScatter + Seq_kokkos-nonlocal + MatCreateMPIAIJWithSeqAIJ path +// is replaced by PETSc's CPU MatCreateSubMatrix on the full MPIAIJ input, +// converted back to MATMPIAIJKOKKOS. Combine with PFLARE_ABLATE_DIAG_SUBMAT=0 +// so that only the off-diag section is ablated while diag uses our Kokkos kernel. +// Only the first-call (non-reuse) path is ablated, matching the observed failure mode. +#ifndef PFLARE_ABLATE_OFFDIAG_SUBMAT +#define PFLARE_ABLATE_OFFDIAG_SUBMAT 1 +#endif + if (mpi) { +#if PFLARE_ABLATE_OFFDIAG_SUBMAT + if (!reuse_int) + { + // We need global IS indices (is_row/is_col on device are already LOCAL, + // i.e. row_global - global_row_start; add back the offset before calling + // PETSc's CPU MatCreateSubMatrix which expects global indices). + PetscInt global_row_start_abl = 0, global_row_end_abl = 0; + PetscInt global_col_start_abl = 0, global_col_end_abl = 0; + PetscCallVoid(MatGetOwnershipRange(*input_mat, &global_row_start_abl, &global_row_end_abl)); + PetscCallVoid(MatGetOwnershipRangeColumn(*input_mat, &global_col_start_abl, &global_col_end_abl)); + + const PetscInt n_row_abl = (PetscInt)is_row_d_d.extent(0); + const PetscInt n_col_abl = (PetscInt)is_col_d_d.extent(0); + PetscInt *is_row_g_arr = NULL, *is_col_g_arr = NULL; + PetscCallVoid(PetscMalloc1(n_row_abl > 0 ? n_row_abl : 1, &is_row_g_arr)); + PetscCallVoid(PetscMalloc1(n_col_abl > 0 ? n_col_abl : 1, &is_col_g_arr)); + + // Copy local device indices to host then shift back to global. + PetscIntKokkosViewHost is_row_g_h(is_row_g_arr, n_row_abl); + PetscIntKokkosViewHost is_col_g_h(is_col_g_arr, n_col_abl); + Kokkos::deep_copy(exec, is_row_g_h, is_row_d_d); + Kokkos::deep_copy(exec, is_col_g_h, is_col_d_d); + Kokkos::fence(); + for (PetscInt ii = 0; ii < n_row_abl; ii++) is_row_g_arr[ii] += global_row_start_abl; + for (PetscInt ii = 0; ii < n_col_abl; ii++) is_col_g_arr[ii] += global_col_start_abl; + + IS is_row_g_abl = NULL, is_col_g_abl = NULL; + PetscCallVoid(ISCreateGeneral(MPI_COMM_MATRIX, n_row_abl, is_row_g_arr, PETSC_OWN_POINTER, &is_row_g_abl)); + PetscCallVoid(ISCreateGeneral(MPI_COMM_MATRIX, n_col_abl, is_col_g_arr, PETSC_OWN_POINTER, &is_col_g_abl)); + + Mat tmp_abl = NULL; + PetscCallVoid(MatCreateSubMatrix(*input_mat, is_row_g_abl, is_col_g_abl, MAT_INITIAL_MATRIX, output_mat)); + //PetscCallVoid(MatConvert(tmp_abl, MATMPIAIJKOKKOS, MAT_INITIAL_MATRIX, output_mat)); + //PetscCallVoid(MatDestroy(&tmp_abl)); + PetscCallVoid(MatDestroy(&output_mat_local)); // diag mat no longer needed + PetscCallVoid(ISDestroy(&is_row_g_abl)); + PetscCallVoid(ISDestroy(&is_col_g_abl)); + return; + } +#endif PetscIntKokkosView is_col_o_d, garray_output_d; if (!reuse_int) From e66eb87401d3c088fff5d2058da9d3f64fcd667c Mon Sep 17 00:00:00 2001 From: sdargavi Date: Wed, 8 Apr 2026 20:26:30 +0100 Subject: [PATCH 43/60] Test cpu only in kokkos --- src/PETSc_Helperk.kokkos.cxx | 94 ++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index df2e008b..2fd9daec 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2464,54 +2464,54 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV // diag Seq_kokkos kernel chain. Reuse path is unchanged (crashes are // first-call only). Toggle off (set to 0) to restore the original path. #ifndef PFLARE_ABLATE_DIAG_SUBMAT -#define PFLARE_ABLATE_DIAG_SUBMAT 0 +#define PFLARE_ABLATE_DIAG_SUBMAT 1 #endif - // The diagonal component -#if PFLARE_ABLATE_DIAG_SUBMAT - if (!reuse_int) - { - // Pull the (already-local) is_row / is_col indices back to the host so - // PETSc's CPU MatCreateSubMatrix can consume them. mat_local is a - // SeqAIJKokkos but PETSc's MatCreateSubMatrix dispatches to the host - // SeqAIJ implementation, producing a SeqAIJ result that we then convert - // back to SeqAIJKokkos for the downstream MatCreateMPIAIJWithSeqAIJ. - const PetscInt n_row_h = is_row_d_d.extent(0); - const PetscInt n_col_h = is_col_d_d.extent(0); - PetscInt *is_row_host_arr = NULL, *is_col_host_arr = NULL; - PetscCallVoid(PetscMalloc1(n_row_h > 0 ? n_row_h : 1, &is_row_host_arr)); - PetscCallVoid(PetscMalloc1(n_col_h > 0 ? n_col_h : 1, &is_col_host_arr)); - PetscIntKokkosViewHost is_row_h_view(is_row_host_arr, n_row_h); - PetscIntKokkosViewHost is_col_h_view(is_col_host_arr, n_col_h); - Kokkos::deep_copy(exec, is_row_h_view, is_row_d_d); - Kokkos::deep_copy(exec, is_col_h_view, is_col_d_d); - Kokkos::fence(); - - IS is_row_temp = NULL, is_col_temp = NULL; - PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, n_row_h, is_row_host_arr, PETSC_COPY_VALUES, &is_row_temp)); - PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, n_col_h, is_col_host_arr, PETSC_COPY_VALUES, &is_col_temp)); - - Mat tmp_host_mat = NULL; - PetscCallVoid(MatCreateSubMatrix(mat_local, is_row_temp, is_col_temp, MAT_INITIAL_MATRIX, &output_mat_local)); - // Convert the SeqAIJ host result to SeqAIJKokkos so the downstream - // MatCreateMPIAIJWithSeqAIJ + reuse storage hand-off still get a Kokkos - // seq block (matches what MatCreateSubMatrix_Seq_kokkos would have - // produced). - //PetscCallVoid(MatConvert(tmp_host_mat, MATSEQAIJKOKKOS, MAT_INITIAL_MATRIX, &output_mat_local)); - - //PetscCallVoid(MatDestroy(&tmp_host_mat)); - PetscCallVoid(ISDestroy(&is_row_temp)); - PetscCallVoid(ISDestroy(&is_col_temp)); - PetscCallVoid(PetscFree(is_row_host_arr)); - PetscCallVoid(PetscFree(is_col_host_arr)); - } - else - { - MatCreateSubMatrix_Seq_kokkos(&mat_local, is_row_d_d, is_col_d_d, reuse_int, &output_mat_local); - } -#else - MatCreateSubMatrix_Seq_kokkos(&mat_local, is_row_d_d, is_col_d_d, reuse_int, &output_mat_local); -#endif +// // The diagonal component +// #if PFLARE_ABLATE_DIAG_SUBMAT +// if (!reuse_int) +// { +// // Pull the (already-local) is_row / is_col indices back to the host so +// // PETSc's CPU MatCreateSubMatrix can consume them. mat_local is a +// // SeqAIJKokkos but PETSc's MatCreateSubMatrix dispatches to the host +// // SeqAIJ implementation, producing a SeqAIJ result that we then convert +// // back to SeqAIJKokkos for the downstream MatCreateMPIAIJWithSeqAIJ. +// const PetscInt n_row_h = is_row_d_d.extent(0); +// const PetscInt n_col_h = is_col_d_d.extent(0); +// PetscInt *is_row_host_arr = NULL, *is_col_host_arr = NULL; +// PetscCallVoid(PetscMalloc1(n_row_h > 0 ? n_row_h : 1, &is_row_host_arr)); +// PetscCallVoid(PetscMalloc1(n_col_h > 0 ? n_col_h : 1, &is_col_host_arr)); +// PetscIntKokkosViewHost is_row_h_view(is_row_host_arr, n_row_h); +// PetscIntKokkosViewHost is_col_h_view(is_col_host_arr, n_col_h); +// Kokkos::deep_copy(exec, is_row_h_view, is_row_d_d); +// Kokkos::deep_copy(exec, is_col_h_view, is_col_d_d); +// Kokkos::fence(); + +// IS is_row_temp = NULL, is_col_temp = NULL; +// PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, n_row_h, is_row_host_arr, PETSC_COPY_VALUES, &is_row_temp)); +// PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, n_col_h, is_col_host_arr, PETSC_COPY_VALUES, &is_col_temp)); + +// Mat tmp_host_mat = NULL; +// PetscCallVoid(MatCreateSubMatrix(mat_local, is_row_temp, is_col_temp, MAT_INITIAL_MATRIX, &output_mat_local)); +// // Convert the SeqAIJ host result to SeqAIJKokkos so the downstream +// // MatCreateMPIAIJWithSeqAIJ + reuse storage hand-off still get a Kokkos +// // seq block (matches what MatCreateSubMatrix_Seq_kokkos would have +// // produced). +// //PetscCallVoid(MatConvert(tmp_host_mat, MATSEQAIJKOKKOS, MAT_INITIAL_MATRIX, &output_mat_local)); + +// //PetscCallVoid(MatDestroy(&tmp_host_mat)); +// PetscCallVoid(ISDestroy(&is_row_temp)); +// PetscCallVoid(ISDestroy(&is_col_temp)); +// PetscCallVoid(PetscFree(is_row_host_arr)); +// PetscCallVoid(PetscFree(is_col_host_arr)); +// } +// else +// { +// MatCreateSubMatrix_Seq_kokkos(&mat_local, is_row_d_d, is_col_d_d, reuse_int, &output_mat_local); +// } +// #else +// MatCreateSubMatrix_Seq_kokkos(&mat_local, is_row_d_d, is_col_d_d, reuse_int, &output_mat_local); +// #endif // The off-diagonal component requires some comms // Basically a copy of MatCreateSubMatrix_MPIAIJ_SameRowColDist @@ -2562,7 +2562,7 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV PetscCallVoid(MatCreateSubMatrix(*input_mat, is_row_g_abl, is_col_g_abl, MAT_INITIAL_MATRIX, output_mat)); //PetscCallVoid(MatConvert(tmp_abl, MATMPIAIJKOKKOS, MAT_INITIAL_MATRIX, output_mat)); //PetscCallVoid(MatDestroy(&tmp_abl)); - PetscCallVoid(MatDestroy(&output_mat_local)); // diag mat no longer needed + //PetscCallVoid(MatDestroy(&output_mat_local)); // diag mat no longer needed PetscCallVoid(ISDestroy(&is_row_g_abl)); PetscCallVoid(ISDestroy(&is_col_g_abl)); return; From afa76fd3a5333f9c15c61bdc5878c2945b249c93 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Wed, 8 Apr 2026 20:47:12 +0100 Subject: [PATCH 44/60] Extra assert --- src/PETSc_Helperk.kokkos.cxx | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 2fd9daec..830b2a5f 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2210,22 +2210,29 @@ PETSC_INTERN void MatCreateSubMatrix_Seq_kokkos(Mat *input_mat, PetscIntKokkosVi // Execute with scratch memory Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { - + // i_idx_is_row is the row index into the output const PetscInt i_idx_is_row = t.league_rank(); // i is the row index into the input - const PetscInt i = is_row_d_d(i_idx_is_row); + const PetscInt i = is_row_d_d(i_idx_is_row); // number of columns PetscInt ncols_local; ncols_local = device_local_i[i + 1] - device_local_i[i]; ScratchIntView scratch_indices; + // DIAGNOSTIC: ncols_local must not exceed max_nnz_local. + // If it does the scratch allocation below overruns the per-team + // budget and silently corrupts adjacent device memory. + // Use Kokkos::abort (not KOKKOS_ASSERT) so this fires unconditionally + // regardless of NDEBUG / KOKKOS_ENABLE_DEBUG build flags. + if (ncols_local > max_nnz_local) Kokkos::abort("PFLARE: ncols_local > max_nnz_local in MatCreateSubMatrix_Seq_kokkos — scratch pool overflow"); + // Allocate views directly on scratch memory // Have to use views here given alignment issues // We have of size ncols+1 to account for the exclusive scan - scratch_indices = ScratchIntView(t.team_scratch(1), ncols_local+1); - + scratch_indices = ScratchIntView(t.team_scratch(1), ncols_local+1); + // Initialize scratch Kokkos::parallel_for(Kokkos::TeamVectorRange(t, ncols_local+1), [&](const PetscInt j) { scratch_indices(j) = 0; @@ -2318,22 +2325,25 @@ PETSC_INTERN void MatCreateSubMatrix_Seq_kokkos(Mat *input_mat, PetscIntKokkosVi // Execute with scratch memory Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { - + // i_idx_is_row is the row index into the output const PetscInt i_idx_is_row = t.league_rank(); // i is the row index into the input - const PetscInt i = is_row_d_d(i_idx_is_row); + const PetscInt i = is_row_d_d(i_idx_is_row); // number of columns PetscInt ncols_local; ncols_local = device_local_i[i + 1] - device_local_i[i]; ScratchIntView scratch_indices; + // DIAGNOSTIC: same scratch-overflow guard as in the non-reuse kernel above. + KOKKOS_ASSERT(ncols_local <= max_nnz_local); + // Allocate views directly on scratch memory // Have to use views here given alignment issues // We have of size ncols+1 to account for the exclusive scan - scratch_indices = ScratchIntView(t.team_scratch(1), ncols_local+1); - + scratch_indices = ScratchIntView(t.team_scratch(1), ncols_local+1); + // Initialize scratch Kokkos::parallel_for(Kokkos::TeamVectorRange(t, ncols_local+1), [&](const PetscInt j) { scratch_indices(j) = 0; @@ -2464,7 +2474,7 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV // diag Seq_kokkos kernel chain. Reuse path is unchanged (crashes are // first-call only). Toggle off (set to 0) to restore the original path. #ifndef PFLARE_ABLATE_DIAG_SUBMAT -#define PFLARE_ABLATE_DIAG_SUBMAT 1 +#define PFLARE_ABLATE_DIAG_SUBMAT 0 #endif // // The diagonal component @@ -2523,7 +2533,7 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV // so that only the off-diag section is ablated while diag uses our Kokkos kernel. // Only the first-call (non-reuse) path is ablated, matching the observed failure mode. #ifndef PFLARE_ABLATE_OFFDIAG_SUBMAT -#define PFLARE_ABLATE_OFFDIAG_SUBMAT 1 +#define PFLARE_ABLATE_OFFDIAG_SUBMAT 0 #endif if (mpi) From dd941a68f73860c7b83bdc6bd3f1875532d49447 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Wed, 8 Apr 2026 21:18:09 +0100 Subject: [PATCH 45/60] Reenable --- src/PETSc_Helperk.kokkos.cxx | 94 ++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 830b2a5f..f15c74bd 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2477,51 +2477,51 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV #define PFLARE_ABLATE_DIAG_SUBMAT 0 #endif -// // The diagonal component -// #if PFLARE_ABLATE_DIAG_SUBMAT -// if (!reuse_int) -// { -// // Pull the (already-local) is_row / is_col indices back to the host so -// // PETSc's CPU MatCreateSubMatrix can consume them. mat_local is a -// // SeqAIJKokkos but PETSc's MatCreateSubMatrix dispatches to the host -// // SeqAIJ implementation, producing a SeqAIJ result that we then convert -// // back to SeqAIJKokkos for the downstream MatCreateMPIAIJWithSeqAIJ. -// const PetscInt n_row_h = is_row_d_d.extent(0); -// const PetscInt n_col_h = is_col_d_d.extent(0); -// PetscInt *is_row_host_arr = NULL, *is_col_host_arr = NULL; -// PetscCallVoid(PetscMalloc1(n_row_h > 0 ? n_row_h : 1, &is_row_host_arr)); -// PetscCallVoid(PetscMalloc1(n_col_h > 0 ? n_col_h : 1, &is_col_host_arr)); -// PetscIntKokkosViewHost is_row_h_view(is_row_host_arr, n_row_h); -// PetscIntKokkosViewHost is_col_h_view(is_col_host_arr, n_col_h); -// Kokkos::deep_copy(exec, is_row_h_view, is_row_d_d); -// Kokkos::deep_copy(exec, is_col_h_view, is_col_d_d); -// Kokkos::fence(); - -// IS is_row_temp = NULL, is_col_temp = NULL; -// PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, n_row_h, is_row_host_arr, PETSC_COPY_VALUES, &is_row_temp)); -// PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, n_col_h, is_col_host_arr, PETSC_COPY_VALUES, &is_col_temp)); - -// Mat tmp_host_mat = NULL; -// PetscCallVoid(MatCreateSubMatrix(mat_local, is_row_temp, is_col_temp, MAT_INITIAL_MATRIX, &output_mat_local)); -// // Convert the SeqAIJ host result to SeqAIJKokkos so the downstream -// // MatCreateMPIAIJWithSeqAIJ + reuse storage hand-off still get a Kokkos -// // seq block (matches what MatCreateSubMatrix_Seq_kokkos would have -// // produced). -// //PetscCallVoid(MatConvert(tmp_host_mat, MATSEQAIJKOKKOS, MAT_INITIAL_MATRIX, &output_mat_local)); - -// //PetscCallVoid(MatDestroy(&tmp_host_mat)); -// PetscCallVoid(ISDestroy(&is_row_temp)); -// PetscCallVoid(ISDestroy(&is_col_temp)); -// PetscCallVoid(PetscFree(is_row_host_arr)); -// PetscCallVoid(PetscFree(is_col_host_arr)); -// } -// else -// { -// MatCreateSubMatrix_Seq_kokkos(&mat_local, is_row_d_d, is_col_d_d, reuse_int, &output_mat_local); -// } -// #else -// MatCreateSubMatrix_Seq_kokkos(&mat_local, is_row_d_d, is_col_d_d, reuse_int, &output_mat_local); -// #endif + // The diagonal component +#if PFLARE_ABLATE_DIAG_SUBMAT + if (!reuse_int) + { + // Pull the (already-local) is_row / is_col indices back to the host so + // PETSc's CPU MatCreateSubMatrix can consume them. mat_local is a + // SeqAIJKokkos but PETSc's MatCreateSubMatrix dispatches to the host + // SeqAIJ implementation, producing a SeqAIJ result that we then convert + // back to SeqAIJKokkos for the downstream MatCreateMPIAIJWithSeqAIJ. + const PetscInt n_row_h = is_row_d_d.extent(0); + const PetscInt n_col_h = is_col_d_d.extent(0); + PetscInt *is_row_host_arr = NULL, *is_col_host_arr = NULL; + PetscCallVoid(PetscMalloc1(n_row_h > 0 ? n_row_h : 1, &is_row_host_arr)); + PetscCallVoid(PetscMalloc1(n_col_h > 0 ? n_col_h : 1, &is_col_host_arr)); + PetscIntKokkosViewHost is_row_h_view(is_row_host_arr, n_row_h); + PetscIntKokkosViewHost is_col_h_view(is_col_host_arr, n_col_h); + Kokkos::deep_copy(exec, is_row_h_view, is_row_d_d); + Kokkos::deep_copy(exec, is_col_h_view, is_col_d_d); + Kokkos::fence(); + + IS is_row_temp = NULL, is_col_temp = NULL; + PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, n_row_h, is_row_host_arr, PETSC_COPY_VALUES, &is_row_temp)); + PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, n_col_h, is_col_host_arr, PETSC_COPY_VALUES, &is_col_temp)); + + Mat tmp_host_mat = NULL; + PetscCallVoid(MatCreateSubMatrix(mat_local, is_row_temp, is_col_temp, MAT_INITIAL_MATRIX, &output_mat_local)); + // Convert the SeqAIJ host result to SeqAIJKokkos so the downstream + // MatCreateMPIAIJWithSeqAIJ + reuse storage hand-off still get a Kokkos + // seq block (matches what MatCreateSubMatrix_Seq_kokkos would have + // produced). + //PetscCallVoid(MatConvert(tmp_host_mat, MATSEQAIJKOKKOS, MAT_INITIAL_MATRIX, &output_mat_local)); + + //PetscCallVoid(MatDestroy(&tmp_host_mat)); + PetscCallVoid(ISDestroy(&is_row_temp)); + PetscCallVoid(ISDestroy(&is_col_temp)); + PetscCallVoid(PetscFree(is_row_host_arr)); + PetscCallVoid(PetscFree(is_col_host_arr)); + } + else + { + MatCreateSubMatrix_Seq_kokkos(&mat_local, is_row_d_d, is_col_d_d, reuse_int, &output_mat_local); + } +#else + MatCreateSubMatrix_Seq_kokkos(&mat_local, is_row_d_d, is_col_d_d, reuse_int, &output_mat_local); +#endif // The off-diagonal component requires some comms // Basically a copy of MatCreateSubMatrix_MPIAIJ_SameRowColDist @@ -2829,11 +2829,11 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV PetscCallVoid(PetscLogGpuToCpu(bytes)); // Now create an IS IS iscol_o; - PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, is_col_o_h.extent(0), is_col_o_host, PETSC_OWN_POINTER, &iscol_o)); + PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, is_col_o_h.extent(0), is_col_o_host, PETSC_COPY_VALUES, &iscol_o)); // Register it with the output_mat PetscCallVoid(PetscObjectCompose((PetscObject)(*output_mat), "iscol_o", (PetscObject)iscol_o)); // The ref counter is incremented by the compose - PetscCallVoid(ISDestroy(&iscol_o)); + //PetscCallVoid(ISDestroy(&iscol_o)); std::cerr << "nine " << std::endl; From 5f0e04bc525bb196770ddfee62a6409fd42e84fb Mon Sep 17 00:00:00 2001 From: sdargavi Date: Thu, 9 Apr 2026 21:02:56 +0100 Subject: [PATCH 46/60] C version of petsc cpu --- src/PETSc_Helperk.kokkos.cxx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index f15c74bd..1dd750ff 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2860,6 +2860,10 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos(Mat *input_mat, IS *is_row, IS *is_c const int our_level, const int is_row_fine_int, const int is_col_fine_int) { PflareKokkosTrace _trace("MatCreateSubMatrix_kokkos"); + + PetscCallVoid(MatCreateSubMatrix(*input_mat, *is_row, *is_col, MAT_INITIAL_MATRIX, output_mat)); + return; + PetscInt global_row_start, global_row_end_plus_one; PetscInt global_col_start, global_col_end_plus_one; PetscCallVoid(MatGetOwnershipRange(*input_mat, &global_row_start, &global_row_end_plus_one)); From 3564898a9833bcc73a63d9a8cef245a80ce4ee60 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Thu, 9 Apr 2026 21:20:01 +0100 Subject: [PATCH 47/60] IS check --- src/PETSc_Helperk.kokkos.cxx | 113 +++++++++++++++++++---------------- 1 file changed, 63 insertions(+), 50 deletions(-) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 1dd750ff..624f34a0 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2424,7 +2424,7 @@ PETSC_INTERN void MatCreateSubMatrix_Seq_kokkos(Mat *input_mat, PetscIntKokkosVi // is_col must be sorted // This one uses the views is_row_d_d and is_col_d_d directly, rewritten to be the local indices PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosView &is_row_d_d, PetscInt global_rows_row, \ - PetscIntKokkosView &is_col_d_d, PetscInt global_cols_col, const int reuse_int, Mat *output_mat) + PetscIntKokkosView &is_col_d_d, PetscInt global_cols_col, const int reuse_int, Mat *output_mat, IS *rows_rows, IS *cols_cols) { PflareKokkosTrace _trace("MatCreateSubMatrix_kokkos_view"); PetscInt local_rows, local_cols; @@ -2477,51 +2477,51 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV #define PFLARE_ABLATE_DIAG_SUBMAT 0 #endif - // The diagonal component -#if PFLARE_ABLATE_DIAG_SUBMAT - if (!reuse_int) - { - // Pull the (already-local) is_row / is_col indices back to the host so - // PETSc's CPU MatCreateSubMatrix can consume them. mat_local is a - // SeqAIJKokkos but PETSc's MatCreateSubMatrix dispatches to the host - // SeqAIJ implementation, producing a SeqAIJ result that we then convert - // back to SeqAIJKokkos for the downstream MatCreateMPIAIJWithSeqAIJ. - const PetscInt n_row_h = is_row_d_d.extent(0); - const PetscInt n_col_h = is_col_d_d.extent(0); - PetscInt *is_row_host_arr = NULL, *is_col_host_arr = NULL; - PetscCallVoid(PetscMalloc1(n_row_h > 0 ? n_row_h : 1, &is_row_host_arr)); - PetscCallVoid(PetscMalloc1(n_col_h > 0 ? n_col_h : 1, &is_col_host_arr)); - PetscIntKokkosViewHost is_row_h_view(is_row_host_arr, n_row_h); - PetscIntKokkosViewHost is_col_h_view(is_col_host_arr, n_col_h); - Kokkos::deep_copy(exec, is_row_h_view, is_row_d_d); - Kokkos::deep_copy(exec, is_col_h_view, is_col_d_d); - Kokkos::fence(); - - IS is_row_temp = NULL, is_col_temp = NULL; - PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, n_row_h, is_row_host_arr, PETSC_COPY_VALUES, &is_row_temp)); - PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, n_col_h, is_col_host_arr, PETSC_COPY_VALUES, &is_col_temp)); - - Mat tmp_host_mat = NULL; - PetscCallVoid(MatCreateSubMatrix(mat_local, is_row_temp, is_col_temp, MAT_INITIAL_MATRIX, &output_mat_local)); - // Convert the SeqAIJ host result to SeqAIJKokkos so the downstream - // MatCreateMPIAIJWithSeqAIJ + reuse storage hand-off still get a Kokkos - // seq block (matches what MatCreateSubMatrix_Seq_kokkos would have - // produced). - //PetscCallVoid(MatConvert(tmp_host_mat, MATSEQAIJKOKKOS, MAT_INITIAL_MATRIX, &output_mat_local)); - - //PetscCallVoid(MatDestroy(&tmp_host_mat)); - PetscCallVoid(ISDestroy(&is_row_temp)); - PetscCallVoid(ISDestroy(&is_col_temp)); - PetscCallVoid(PetscFree(is_row_host_arr)); - PetscCallVoid(PetscFree(is_col_host_arr)); - } - else - { - MatCreateSubMatrix_Seq_kokkos(&mat_local, is_row_d_d, is_col_d_d, reuse_int, &output_mat_local); - } -#else - MatCreateSubMatrix_Seq_kokkos(&mat_local, is_row_d_d, is_col_d_d, reuse_int, &output_mat_local); -#endif +// // The diagonal component +// #if PFLARE_ABLATE_DIAG_SUBMAT +// if (!reuse_int) +// { +// // Pull the (already-local) is_row / is_col indices back to the host so +// // PETSc's CPU MatCreateSubMatrix can consume them. mat_local is a +// // SeqAIJKokkos but PETSc's MatCreateSubMatrix dispatches to the host +// // SeqAIJ implementation, producing a SeqAIJ result that we then convert +// // back to SeqAIJKokkos for the downstream MatCreateMPIAIJWithSeqAIJ. +// const PetscInt n_row_h = is_row_d_d.extent(0); +// const PetscInt n_col_h = is_col_d_d.extent(0); +// PetscInt *is_row_host_arr = NULL, *is_col_host_arr = NULL; +// PetscCallVoid(PetscMalloc1(n_row_h > 0 ? n_row_h : 1, &is_row_host_arr)); +// PetscCallVoid(PetscMalloc1(n_col_h > 0 ? n_col_h : 1, &is_col_host_arr)); +// PetscIntKokkosViewHost is_row_h_view(is_row_host_arr, n_row_h); +// PetscIntKokkosViewHost is_col_h_view(is_col_host_arr, n_col_h); +// Kokkos::deep_copy(exec, is_row_h_view, is_row_d_d); +// Kokkos::deep_copy(exec, is_col_h_view, is_col_d_d); +// Kokkos::fence(); + +// IS is_row_temp = NULL, is_col_temp = NULL; +// PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, n_row_h, is_row_host_arr, PETSC_COPY_VALUES, &is_row_temp)); +// PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, n_col_h, is_col_host_arr, PETSC_COPY_VALUES, &is_col_temp)); + +// Mat tmp_host_mat = NULL; +// PetscCallVoid(MatCreateSubMatrix(mat_local, is_row_temp, is_col_temp, MAT_INITIAL_MATRIX, &output_mat_local)); +// // Convert the SeqAIJ host result to SeqAIJKokkos so the downstream +// // MatCreateMPIAIJWithSeqAIJ + reuse storage hand-off still get a Kokkos +// // seq block (matches what MatCreateSubMatrix_Seq_kokkos would have +// // produced). +// //PetscCallVoid(MatConvert(tmp_host_mat, MATSEQAIJKOKKOS, MAT_INITIAL_MATRIX, &output_mat_local)); + +// //PetscCallVoid(MatDestroy(&tmp_host_mat)); +// PetscCallVoid(ISDestroy(&is_row_temp)); +// PetscCallVoid(ISDestroy(&is_col_temp)); +// PetscCallVoid(PetscFree(is_row_host_arr)); +// PetscCallVoid(PetscFree(is_col_host_arr)); +// } +// else +// { +// MatCreateSubMatrix_Seq_kokkos(&mat_local, is_row_d_d, is_col_d_d, reuse_int, &output_mat_local); +// } +// #else +// MatCreateSubMatrix_Seq_kokkos(&mat_local, is_row_d_d, is_col_d_d, reuse_int, &output_mat_local); +// #endif // The off-diagonal component requires some comms // Basically a copy of MatCreateSubMatrix_MPIAIJ_SameRowColDist @@ -2533,7 +2533,7 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV // so that only the off-diag section is ablated while diag uses our Kokkos kernel. // Only the first-call (non-reuse) path is ablated, matching the observed failure mode. #ifndef PFLARE_ABLATE_OFFDIAG_SUBMAT -#define PFLARE_ABLATE_OFFDIAG_SUBMAT 0 +#define PFLARE_ABLATE_OFFDIAG_SUBMAT 1 #endif if (mpi) @@ -2568,6 +2568,19 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV PetscCallVoid(ISCreateGeneral(MPI_COMM_MATRIX, n_row_abl, is_row_g_arr, PETSC_OWN_POINTER, &is_row_g_abl)); PetscCallVoid(ISCreateGeneral(MPI_COMM_MATRIX, n_col_abl, is_col_g_arr, PETSC_OWN_POINTER, &is_col_g_abl)); + PetscBool equal_flag; + PetscCallVoid(ISEqualUnsorted(is_row_g_abl, *rows_rows, &equal_flag)); + +PetscCheckAbort(equal_flag, MPI_COMM_MATRIX, + PETSC_ERR_PLIB, + "rows not equal"); + + PetscCallVoid(ISEqualUnsorted(is_col_g_abl, *cols_cols, &equal_flag)); + +PetscCheckAbort(equal_flag, MPI_COMM_MATRIX, + PETSC_ERR_PLIB, + "cols not equal"); + Mat tmp_abl = NULL; PetscCallVoid(MatCreateSubMatrix(*input_mat, is_row_g_abl, is_col_g_abl, MAT_INITIAL_MATRIX, output_mat)); //PetscCallVoid(MatConvert(tmp_abl, MATMPIAIJKOKKOS, MAT_INITIAL_MATRIX, output_mat)); @@ -2861,8 +2874,8 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos(Mat *input_mat, IS *is_row, IS *is_c { PflareKokkosTrace _trace("MatCreateSubMatrix_kokkos"); - PetscCallVoid(MatCreateSubMatrix(*input_mat, *is_row, *is_col, MAT_INITIAL_MATRIX, output_mat)); - return; + // PetscCallVoid(MatCreateSubMatrix(*input_mat, *is_row, *is_col, MAT_INITIAL_MATRIX, output_mat)); + // return; PetscInt global_row_start, global_row_end_plus_one; PetscInt global_col_start, global_col_end_plus_one; @@ -2949,7 +2962,7 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos(Mat *input_mat, IS *is_row, IS *is_c } } - MatCreateSubMatrix_kokkos_view(input_mat, is_row_d_d, global_rows_row, is_col_d_d, global_cols_col, reuse_int, output_mat); + MatCreateSubMatrix_kokkos_view(input_mat, is_row_d_d, global_rows_row, is_col_d_d, global_cols_col, reuse_int, output_mat, is_row, is_col); return; } From 81fa70b3a8d2e63d4ebc786eb0baa26596999c57 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Thu, 9 Apr 2026 23:38:39 +0100 Subject: [PATCH 48/60] Move cpu call --- src/PETSc_Helperk.kokkos.cxx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 624f34a0..8da36b1e 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2874,9 +2874,6 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos(Mat *input_mat, IS *is_row, IS *is_c { PflareKokkosTrace _trace("MatCreateSubMatrix_kokkos"); - // PetscCallVoid(MatCreateSubMatrix(*input_mat, *is_row, *is_col, MAT_INITIAL_MATRIX, output_mat)); - // return; - PetscInt global_row_start, global_row_end_plus_one; PetscInt global_col_start, global_col_end_plus_one; PetscCallVoid(MatGetOwnershipRange(*input_mat, &global_row_start, &global_row_end_plus_one)); @@ -2962,6 +2959,9 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos(Mat *input_mat, IS *is_row, IS *is_c } } + PetscCallVoid(MatCreateSubMatrix(*input_mat, *is_row, *is_col, MAT_INITIAL_MATRIX, output_mat)); + return; + MatCreateSubMatrix_kokkos_view(input_mat, is_row_d_d, global_rows_row, is_col_d_d, global_cols_col, reuse_int, output_mat, is_row, is_col); return; From fec7243d30f91dcb96dec294fd4cfadd94d300c9 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Thu, 9 Apr 2026 23:47:05 +0100 Subject: [PATCH 49/60] Change location --- src/PETSc_Helperk.kokkos.cxx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 8da36b1e..b4e2a58a 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2582,7 +2582,8 @@ PetscCheckAbort(equal_flag, MPI_COMM_MATRIX, "cols not equal"); Mat tmp_abl = NULL; - PetscCallVoid(MatCreateSubMatrix(*input_mat, is_row_g_abl, is_col_g_abl, MAT_INITIAL_MATRIX, output_mat)); + //PetscCallVoid(MatCreateSubMatrix(*input_mat, is_row_g_abl, is_col_g_abl, MAT_INITIAL_MATRIX, output_mat)); + PetscCallVoid(MatCreateSubMatrix(*input_mat, *rows_rows, *cols_cols, MAT_INITIAL_MATRIX, output_mat)); //PetscCallVoid(MatConvert(tmp_abl, MATMPIAIJKOKKOS, MAT_INITIAL_MATRIX, output_mat)); //PetscCallVoid(MatDestroy(&tmp_abl)); //PetscCallVoid(MatDestroy(&output_mat_local)); // diag mat no longer needed @@ -2959,8 +2960,8 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos(Mat *input_mat, IS *is_row, IS *is_c } } - PetscCallVoid(MatCreateSubMatrix(*input_mat, *is_row, *is_col, MAT_INITIAL_MATRIX, output_mat)); - return; + // PetscCallVoid(MatCreateSubMatrix(*input_mat, *is_row, *is_col, MAT_INITIAL_MATRIX, output_mat)); + // return; MatCreateSubMatrix_kokkos_view(input_mat, is_row_d_d, global_rows_row, is_col_d_d, global_cols_col, reuse_int, output_mat, is_row, is_col); From 84c292efcb953351262b525677210c23685275b1 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Thu, 9 Apr 2026 23:53:43 +0100 Subject: [PATCH 50/60] Remove extra --- src/PETSc_Helperk.kokkos.cxx | 80 ++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index b4e2a58a..41b28d8a 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2541,54 +2541,54 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV #if PFLARE_ABLATE_OFFDIAG_SUBMAT if (!reuse_int) { - // We need global IS indices (is_row/is_col on device are already LOCAL, - // i.e. row_global - global_row_start; add back the offset before calling - // PETSc's CPU MatCreateSubMatrix which expects global indices). - PetscInt global_row_start_abl = 0, global_row_end_abl = 0; - PetscInt global_col_start_abl = 0, global_col_end_abl = 0; - PetscCallVoid(MatGetOwnershipRange(*input_mat, &global_row_start_abl, &global_row_end_abl)); - PetscCallVoid(MatGetOwnershipRangeColumn(*input_mat, &global_col_start_abl, &global_col_end_abl)); - - const PetscInt n_row_abl = (PetscInt)is_row_d_d.extent(0); - const PetscInt n_col_abl = (PetscInt)is_col_d_d.extent(0); - PetscInt *is_row_g_arr = NULL, *is_col_g_arr = NULL; - PetscCallVoid(PetscMalloc1(n_row_abl > 0 ? n_row_abl : 1, &is_row_g_arr)); - PetscCallVoid(PetscMalloc1(n_col_abl > 0 ? n_col_abl : 1, &is_col_g_arr)); - - // Copy local device indices to host then shift back to global. - PetscIntKokkosViewHost is_row_g_h(is_row_g_arr, n_row_abl); - PetscIntKokkosViewHost is_col_g_h(is_col_g_arr, n_col_abl); - Kokkos::deep_copy(exec, is_row_g_h, is_row_d_d); - Kokkos::deep_copy(exec, is_col_g_h, is_col_d_d); - Kokkos::fence(); - for (PetscInt ii = 0; ii < n_row_abl; ii++) is_row_g_arr[ii] += global_row_start_abl; - for (PetscInt ii = 0; ii < n_col_abl; ii++) is_col_g_arr[ii] += global_col_start_abl; - - IS is_row_g_abl = NULL, is_col_g_abl = NULL; - PetscCallVoid(ISCreateGeneral(MPI_COMM_MATRIX, n_row_abl, is_row_g_arr, PETSC_OWN_POINTER, &is_row_g_abl)); - PetscCallVoid(ISCreateGeneral(MPI_COMM_MATRIX, n_col_abl, is_col_g_arr, PETSC_OWN_POINTER, &is_col_g_abl)); - - PetscBool equal_flag; - PetscCallVoid(ISEqualUnsorted(is_row_g_abl, *rows_rows, &equal_flag)); - -PetscCheckAbort(equal_flag, MPI_COMM_MATRIX, - PETSC_ERR_PLIB, - "rows not equal"); +// // We need global IS indices (is_row/is_col on device are already LOCAL, +// // i.e. row_global - global_row_start; add back the offset before calling +// // PETSc's CPU MatCreateSubMatrix which expects global indices). +// PetscInt global_row_start_abl = 0, global_row_end_abl = 0; +// PetscInt global_col_start_abl = 0, global_col_end_abl = 0; +// PetscCallVoid(MatGetOwnershipRange(*input_mat, &global_row_start_abl, &global_row_end_abl)); +// PetscCallVoid(MatGetOwnershipRangeColumn(*input_mat, &global_col_start_abl, &global_col_end_abl)); + +// const PetscInt n_row_abl = (PetscInt)is_row_d_d.extent(0); +// const PetscInt n_col_abl = (PetscInt)is_col_d_d.extent(0); +// PetscInt *is_row_g_arr = NULL, *is_col_g_arr = NULL; +// PetscCallVoid(PetscMalloc1(n_row_abl > 0 ? n_row_abl : 1, &is_row_g_arr)); +// PetscCallVoid(PetscMalloc1(n_col_abl > 0 ? n_col_abl : 1, &is_col_g_arr)); + +// // Copy local device indices to host then shift back to global. +// PetscIntKokkosViewHost is_row_g_h(is_row_g_arr, n_row_abl); +// PetscIntKokkosViewHost is_col_g_h(is_col_g_arr, n_col_abl); +// Kokkos::deep_copy(exec, is_row_g_h, is_row_d_d); +// Kokkos::deep_copy(exec, is_col_g_h, is_col_d_d); +// Kokkos::fence(); +// for (PetscInt ii = 0; ii < n_row_abl; ii++) is_row_g_arr[ii] += global_row_start_abl; +// for (PetscInt ii = 0; ii < n_col_abl; ii++) is_col_g_arr[ii] += global_col_start_abl; + +// IS is_row_g_abl = NULL, is_col_g_abl = NULL; +// PetscCallVoid(ISCreateGeneral(MPI_COMM_MATRIX, n_row_abl, is_row_g_arr, PETSC_OWN_POINTER, &is_row_g_abl)); +// PetscCallVoid(ISCreateGeneral(MPI_COMM_MATRIX, n_col_abl, is_col_g_arr, PETSC_OWN_POINTER, &is_col_g_abl)); + +// PetscBool equal_flag; +// PetscCallVoid(ISEqualUnsorted(is_row_g_abl, *rows_rows, &equal_flag)); + +// PetscCheckAbort(equal_flag, MPI_COMM_MATRIX, +// PETSC_ERR_PLIB, +// "rows not equal"); - PetscCallVoid(ISEqualUnsorted(is_col_g_abl, *cols_cols, &equal_flag)); +// PetscCallVoid(ISEqualUnsorted(is_col_g_abl, *cols_cols, &equal_flag)); -PetscCheckAbort(equal_flag, MPI_COMM_MATRIX, - PETSC_ERR_PLIB, - "cols not equal"); +// PetscCheckAbort(equal_flag, MPI_COMM_MATRIX, +// PETSC_ERR_PLIB, +// "cols not equal"); - Mat tmp_abl = NULL; +// Mat tmp_abl = NULL; //PetscCallVoid(MatCreateSubMatrix(*input_mat, is_row_g_abl, is_col_g_abl, MAT_INITIAL_MATRIX, output_mat)); PetscCallVoid(MatCreateSubMatrix(*input_mat, *rows_rows, *cols_cols, MAT_INITIAL_MATRIX, output_mat)); //PetscCallVoid(MatConvert(tmp_abl, MATMPIAIJKOKKOS, MAT_INITIAL_MATRIX, output_mat)); //PetscCallVoid(MatDestroy(&tmp_abl)); //PetscCallVoid(MatDestroy(&output_mat_local)); // diag mat no longer needed - PetscCallVoid(ISDestroy(&is_row_g_abl)); - PetscCallVoid(ISDestroy(&is_col_g_abl)); + //PetscCallVoid(ISDestroy(&is_row_g_abl)); + //PetscCallVoid(ISDestroy(&is_col_g_abl)); return; } #endif From 2170ed3b838c102f2580090b6ccffdd0266b9361 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Fri, 10 Apr 2026 00:00:24 +0100 Subject: [PATCH 51/60] Comment out useless --- src/PETSc_Helperk.kokkos.cxx | 47 ++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 41b28d8a..16a4a452 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2430,41 +2430,42 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV PetscInt local_rows, local_cols; PetscInt global_rows, global_cols; PetscInt global_row_start, global_row_end_plus_one; - PetscCallVoid(MatGetOwnershipRange(*input_mat, &global_row_start, &global_row_end_plus_one)); + // PetscCallVoid(MatGetOwnershipRange(*input_mat, &global_row_start, &global_row_end_plus_one)); PetscInt local_cols_col = is_col_d_d.extent(0); auto exec = PetscGetKokkosExecutionSpace(); - // Are we in parallel? - MatType mat_type; + // // Are we in parallel? + // MatType mat_type; MPI_Comm MPI_COMM_MATRIX; - PetscCallVoid(MatGetType(*input_mat, &mat_type)); + // PetscCallVoid(MatGetType(*input_mat, &mat_type)); - const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; - PetscCallVoid(PetscObjectGetComm((PetscObject)*input_mat, &MPI_COMM_MATRIX)); - PetscCallVoid(MatGetSize(*input_mat, &global_rows, &global_cols)); - PetscCallVoid(MatGetLocalSize(*input_mat, &local_rows, &local_cols)); + // const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; + const bool mpi = true; + // PetscCallVoid(PetscObjectGetComm((PetscObject)*input_mat, &MPI_COMM_MATRIX)); + // PetscCallVoid(MatGetSize(*input_mat, &global_rows, &global_cols)); + // PetscCallVoid(MatGetLocalSize(*input_mat, &local_rows, &local_cols)); Mat_MPIAIJ *mat_mpi = nullptr; Mat mat_local = NULL, mat_nonlocal = NULL; Mat output_mat_local, output_mat_nonlocal; PetscInt rows_ao, cols_ao; - if (mpi) - { - mat_mpi = (Mat_MPIAIJ *)(*input_mat)->data; - PetscCallVoid(MatMPIAIJGetSeqAIJ(*input_mat, &mat_local, &mat_nonlocal, NULL)); - PetscCallVoid(MatGetSize(mat_nonlocal, &rows_ao, &cols_ao)); + // if (mpi) + // { + // mat_mpi = (Mat_MPIAIJ *)(*input_mat)->data; + // PetscCallVoid(MatMPIAIJGetSeqAIJ(*input_mat, &mat_local, &mat_nonlocal, NULL)); + // PetscCallVoid(MatGetSize(mat_nonlocal, &rows_ao, &cols_ao)); - if (reuse_int) - { - PetscCallVoid(MatMPIAIJGetSeqAIJ(*output_mat, &output_mat_local, &output_mat_nonlocal, NULL)); - } - } - else - { - mat_local = *input_mat; - if (reuse_int) output_mat_local = *output_mat; - } + // if (reuse_int) + // { + // PetscCallVoid(MatMPIAIJGetSeqAIJ(*output_mat, &output_mat_local, &output_mat_nonlocal, NULL)); + // } + // } + // else + // { + // mat_local = *input_mat; + // if (reuse_int) output_mat_local = *output_mat; + // } size_t bytes = 0; // Ablation toggle (Step 2 of plan): when defined non-zero, the diagonal From 0e34228680dd3e3187794c9e699e005b48d230de Mon Sep 17 00:00:00 2001 From: sdargavi Date: Fri, 10 Apr 2026 00:06:04 +0100 Subject: [PATCH 52/60] Check everything disabled --- src/PETSc_Helperk.kokkos.cxx | 826 +++++++++++++++++------------------ 1 file changed, 413 insertions(+), 413 deletions(-) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 16a4a452..9d577515 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2426,440 +2426,440 @@ PETSC_INTERN void MatCreateSubMatrix_Seq_kokkos(Mat *input_mat, PetscIntKokkosVi PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosView &is_row_d_d, PetscInt global_rows_row, \ PetscIntKokkosView &is_col_d_d, PetscInt global_cols_col, const int reuse_int, Mat *output_mat, IS *rows_rows, IS *cols_cols) { - PflareKokkosTrace _trace("MatCreateSubMatrix_kokkos_view"); - PetscInt local_rows, local_cols; - PetscInt global_rows, global_cols; - PetscInt global_row_start, global_row_end_plus_one; - // PetscCallVoid(MatGetOwnershipRange(*input_mat, &global_row_start, &global_row_end_plus_one)); - PetscInt local_cols_col = is_col_d_d.extent(0); - auto exec = PetscGetKokkosExecutionSpace(); - - // // Are we in parallel? - // MatType mat_type; - MPI_Comm MPI_COMM_MATRIX; - // PetscCallVoid(MatGetType(*input_mat, &mat_type)); - - // const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; - const bool mpi = true; - // PetscCallVoid(PetscObjectGetComm((PetscObject)*input_mat, &MPI_COMM_MATRIX)); - // PetscCallVoid(MatGetSize(*input_mat, &global_rows, &global_cols)); - // PetscCallVoid(MatGetLocalSize(*input_mat, &local_rows, &local_cols)); - - Mat_MPIAIJ *mat_mpi = nullptr; - Mat mat_local = NULL, mat_nonlocal = NULL; - Mat output_mat_local, output_mat_nonlocal; +// PflareKokkosTrace _trace("MatCreateSubMatrix_kokkos_view"); +// PetscInt local_rows, local_cols; +// PetscInt global_rows, global_cols; +// PetscInt global_row_start, global_row_end_plus_one; +// // PetscCallVoid(MatGetOwnershipRange(*input_mat, &global_row_start, &global_row_end_plus_one)); +// PetscInt local_cols_col = is_col_d_d.extent(0); +// auto exec = PetscGetKokkosExecutionSpace(); + +// // // Are we in parallel? +// // MatType mat_type; +// MPI_Comm MPI_COMM_MATRIX; +// // PetscCallVoid(MatGetType(*input_mat, &mat_type)); + +// // const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; +// const bool mpi = true; +// // PetscCallVoid(PetscObjectGetComm((PetscObject)*input_mat, &MPI_COMM_MATRIX)); +// // PetscCallVoid(MatGetSize(*input_mat, &global_rows, &global_cols)); +// // PetscCallVoid(MatGetLocalSize(*input_mat, &local_rows, &local_cols)); + +// Mat_MPIAIJ *mat_mpi = nullptr; +// Mat mat_local = NULL, mat_nonlocal = NULL; +// Mat output_mat_local, output_mat_nonlocal; - PetscInt rows_ao, cols_ao; - // if (mpi) - // { - // mat_mpi = (Mat_MPIAIJ *)(*input_mat)->data; - // PetscCallVoid(MatMPIAIJGetSeqAIJ(*input_mat, &mat_local, &mat_nonlocal, NULL)); - // PetscCallVoid(MatGetSize(mat_nonlocal, &rows_ao, &cols_ao)); +// PetscInt rows_ao, cols_ao; +// // if (mpi) +// // { +// // mat_mpi = (Mat_MPIAIJ *)(*input_mat)->data; +// // PetscCallVoid(MatMPIAIJGetSeqAIJ(*input_mat, &mat_local, &mat_nonlocal, NULL)); +// // PetscCallVoid(MatGetSize(mat_nonlocal, &rows_ao, &cols_ao)); - // if (reuse_int) - // { - // PetscCallVoid(MatMPIAIJGetSeqAIJ(*output_mat, &output_mat_local, &output_mat_nonlocal, NULL)); - // } - // } - // else - // { - // mat_local = *input_mat; - // if (reuse_int) output_mat_local = *output_mat; - // } - size_t bytes = 0; - -// Ablation toggle (Step 2 of plan): when defined non-zero, the diagonal -// MatCreateSubMatrix_Seq_kokkos call is replaced by PETSc's host-side -// MatCreateSubMatrix on mat_local plus a MatConvert back to MATSEQAIJKOKKOS. -// Used to test whether the intermittent GPU crash originates inside the -// diag Seq_kokkos kernel chain. Reuse path is unchanged (crashes are -// first-call only). Toggle off (set to 0) to restore the original path. -#ifndef PFLARE_ABLATE_DIAG_SUBMAT -#define PFLARE_ABLATE_DIAG_SUBMAT 0 -#endif - -// // The diagonal component -// #if PFLARE_ABLATE_DIAG_SUBMAT -// if (!reuse_int) -// { -// // Pull the (already-local) is_row / is_col indices back to the host so -// // PETSc's CPU MatCreateSubMatrix can consume them. mat_local is a -// // SeqAIJKokkos but PETSc's MatCreateSubMatrix dispatches to the host -// // SeqAIJ implementation, producing a SeqAIJ result that we then convert -// // back to SeqAIJKokkos for the downstream MatCreateMPIAIJWithSeqAIJ. -// const PetscInt n_row_h = is_row_d_d.extent(0); -// const PetscInt n_col_h = is_col_d_d.extent(0); -// PetscInt *is_row_host_arr = NULL, *is_col_host_arr = NULL; -// PetscCallVoid(PetscMalloc1(n_row_h > 0 ? n_row_h : 1, &is_row_host_arr)); -// PetscCallVoid(PetscMalloc1(n_col_h > 0 ? n_col_h : 1, &is_col_host_arr)); -// PetscIntKokkosViewHost is_row_h_view(is_row_host_arr, n_row_h); -// PetscIntKokkosViewHost is_col_h_view(is_col_host_arr, n_col_h); -// Kokkos::deep_copy(exec, is_row_h_view, is_row_d_d); -// Kokkos::deep_copy(exec, is_col_h_view, is_col_d_d); -// Kokkos::fence(); - -// IS is_row_temp = NULL, is_col_temp = NULL; -// PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, n_row_h, is_row_host_arr, PETSC_COPY_VALUES, &is_row_temp)); -// PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, n_col_h, is_col_host_arr, PETSC_COPY_VALUES, &is_col_temp)); - -// Mat tmp_host_mat = NULL; -// PetscCallVoid(MatCreateSubMatrix(mat_local, is_row_temp, is_col_temp, MAT_INITIAL_MATRIX, &output_mat_local)); -// // Convert the SeqAIJ host result to SeqAIJKokkos so the downstream -// // MatCreateMPIAIJWithSeqAIJ + reuse storage hand-off still get a Kokkos -// // seq block (matches what MatCreateSubMatrix_Seq_kokkos would have -// // produced). -// //PetscCallVoid(MatConvert(tmp_host_mat, MATSEQAIJKOKKOS, MAT_INITIAL_MATRIX, &output_mat_local)); - -// //PetscCallVoid(MatDestroy(&tmp_host_mat)); -// PetscCallVoid(ISDestroy(&is_row_temp)); -// PetscCallVoid(ISDestroy(&is_col_temp)); -// PetscCallVoid(PetscFree(is_row_host_arr)); -// PetscCallVoid(PetscFree(is_col_host_arr)); -// } -// else -// { -// MatCreateSubMatrix_Seq_kokkos(&mat_local, is_row_d_d, is_col_d_d, reuse_int, &output_mat_local); -// } -// #else -// MatCreateSubMatrix_Seq_kokkos(&mat_local, is_row_d_d, is_col_d_d, reuse_int, &output_mat_local); +// // if (reuse_int) +// // { +// // PetscCallVoid(MatMPIAIJGetSeqAIJ(*output_mat, &output_mat_local, &output_mat_nonlocal, NULL)); +// // } +// // } +// // else +// // { +// // mat_local = *input_mat; +// // if (reuse_int) output_mat_local = *output_mat; +// // } +// size_t bytes = 0; + +// // Ablation toggle (Step 2 of plan): when defined non-zero, the diagonal +// // MatCreateSubMatrix_Seq_kokkos call is replaced by PETSc's host-side +// // MatCreateSubMatrix on mat_local plus a MatConvert back to MATSEQAIJKOKKOS. +// // Used to test whether the intermittent GPU crash originates inside the +// // diag Seq_kokkos kernel chain. Reuse path is unchanged (crashes are +// // first-call only). Toggle off (set to 0) to restore the original path. +// #ifndef PFLARE_ABLATE_DIAG_SUBMAT +// #define PFLARE_ABLATE_DIAG_SUBMAT 0 // #endif - // The off-diagonal component requires some comms - // Basically a copy of MatCreateSubMatrix_MPIAIJ_SameRowColDist - -// Off-diagonal ablation toggle (step 2a of plan): when non-zero, the entire -// off-diag VecScatter + Seq_kokkos-nonlocal + MatCreateMPIAIJWithSeqAIJ path -// is replaced by PETSc's CPU MatCreateSubMatrix on the full MPIAIJ input, -// converted back to MATMPIAIJKOKKOS. Combine with PFLARE_ABLATE_DIAG_SUBMAT=0 -// so that only the off-diag section is ablated while diag uses our Kokkos kernel. -// Only the first-call (non-reuse) path is ablated, matching the observed failure mode. -#ifndef PFLARE_ABLATE_OFFDIAG_SUBMAT -#define PFLARE_ABLATE_OFFDIAG_SUBMAT 1 -#endif - - if (mpi) - { -#if PFLARE_ABLATE_OFFDIAG_SUBMAT - if (!reuse_int) - { -// // We need global IS indices (is_row/is_col on device are already LOCAL, -// // i.e. row_global - global_row_start; add back the offset before calling -// // PETSc's CPU MatCreateSubMatrix which expects global indices). -// PetscInt global_row_start_abl = 0, global_row_end_abl = 0; -// PetscInt global_col_start_abl = 0, global_col_end_abl = 0; -// PetscCallVoid(MatGetOwnershipRange(*input_mat, &global_row_start_abl, &global_row_end_abl)); -// PetscCallVoid(MatGetOwnershipRangeColumn(*input_mat, &global_col_start_abl, &global_col_end_abl)); - -// const PetscInt n_row_abl = (PetscInt)is_row_d_d.extent(0); -// const PetscInt n_col_abl = (PetscInt)is_col_d_d.extent(0); -// PetscInt *is_row_g_arr = NULL, *is_col_g_arr = NULL; -// PetscCallVoid(PetscMalloc1(n_row_abl > 0 ? n_row_abl : 1, &is_row_g_arr)); -// PetscCallVoid(PetscMalloc1(n_col_abl > 0 ? n_col_abl : 1, &is_col_g_arr)); - -// // Copy local device indices to host then shift back to global. -// PetscIntKokkosViewHost is_row_g_h(is_row_g_arr, n_row_abl); -// PetscIntKokkosViewHost is_col_g_h(is_col_g_arr, n_col_abl); -// Kokkos::deep_copy(exec, is_row_g_h, is_row_d_d); -// Kokkos::deep_copy(exec, is_col_g_h, is_col_d_d); -// Kokkos::fence(); -// for (PetscInt ii = 0; ii < n_row_abl; ii++) is_row_g_arr[ii] += global_row_start_abl; -// for (PetscInt ii = 0; ii < n_col_abl; ii++) is_col_g_arr[ii] += global_col_start_abl; - -// IS is_row_g_abl = NULL, is_col_g_abl = NULL; -// PetscCallVoid(ISCreateGeneral(MPI_COMM_MATRIX, n_row_abl, is_row_g_arr, PETSC_OWN_POINTER, &is_row_g_abl)); -// PetscCallVoid(ISCreateGeneral(MPI_COMM_MATRIX, n_col_abl, is_col_g_arr, PETSC_OWN_POINTER, &is_col_g_abl)); - -// PetscBool equal_flag; -// PetscCallVoid(ISEqualUnsorted(is_row_g_abl, *rows_rows, &equal_flag)); +// // // The diagonal component +// // #if PFLARE_ABLATE_DIAG_SUBMAT +// // if (!reuse_int) +// // { +// // // Pull the (already-local) is_row / is_col indices back to the host so +// // // PETSc's CPU MatCreateSubMatrix can consume them. mat_local is a +// // // SeqAIJKokkos but PETSc's MatCreateSubMatrix dispatches to the host +// // // SeqAIJ implementation, producing a SeqAIJ result that we then convert +// // // back to SeqAIJKokkos for the downstream MatCreateMPIAIJWithSeqAIJ. +// // const PetscInt n_row_h = is_row_d_d.extent(0); +// // const PetscInt n_col_h = is_col_d_d.extent(0); +// // PetscInt *is_row_host_arr = NULL, *is_col_host_arr = NULL; +// // PetscCallVoid(PetscMalloc1(n_row_h > 0 ? n_row_h : 1, &is_row_host_arr)); +// // PetscCallVoid(PetscMalloc1(n_col_h > 0 ? n_col_h : 1, &is_col_host_arr)); +// // PetscIntKokkosViewHost is_row_h_view(is_row_host_arr, n_row_h); +// // PetscIntKokkosViewHost is_col_h_view(is_col_host_arr, n_col_h); +// // Kokkos::deep_copy(exec, is_row_h_view, is_row_d_d); +// // Kokkos::deep_copy(exec, is_col_h_view, is_col_d_d); +// // Kokkos::fence(); + +// // IS is_row_temp = NULL, is_col_temp = NULL; +// // PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, n_row_h, is_row_host_arr, PETSC_COPY_VALUES, &is_row_temp)); +// // PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, n_col_h, is_col_host_arr, PETSC_COPY_VALUES, &is_col_temp)); + +// // Mat tmp_host_mat = NULL; +// // PetscCallVoid(MatCreateSubMatrix(mat_local, is_row_temp, is_col_temp, MAT_INITIAL_MATRIX, &output_mat_local)); +// // // Convert the SeqAIJ host result to SeqAIJKokkos so the downstream +// // // MatCreateMPIAIJWithSeqAIJ + reuse storage hand-off still get a Kokkos +// // // seq block (matches what MatCreateSubMatrix_Seq_kokkos would have +// // // produced). +// // //PetscCallVoid(MatConvert(tmp_host_mat, MATSEQAIJKOKKOS, MAT_INITIAL_MATRIX, &output_mat_local)); + +// // //PetscCallVoid(MatDestroy(&tmp_host_mat)); +// // PetscCallVoid(ISDestroy(&is_row_temp)); +// // PetscCallVoid(ISDestroy(&is_col_temp)); +// // PetscCallVoid(PetscFree(is_row_host_arr)); +// // PetscCallVoid(PetscFree(is_col_host_arr)); +// // } +// // else +// // { +// // MatCreateSubMatrix_Seq_kokkos(&mat_local, is_row_d_d, is_col_d_d, reuse_int, &output_mat_local); +// // } +// // #else +// // MatCreateSubMatrix_Seq_kokkos(&mat_local, is_row_d_d, is_col_d_d, reuse_int, &output_mat_local); +// // #endif + +// // The off-diagonal component requires some comms +// // Basically a copy of MatCreateSubMatrix_MPIAIJ_SameRowColDist + +// // Off-diagonal ablation toggle (step 2a of plan): when non-zero, the entire +// // off-diag VecScatter + Seq_kokkos-nonlocal + MatCreateMPIAIJWithSeqAIJ path +// // is replaced by PETSc's CPU MatCreateSubMatrix on the full MPIAIJ input, +// // converted back to MATMPIAIJKOKKOS. Combine with PFLARE_ABLATE_DIAG_SUBMAT=0 +// // so that only the off-diag section is ablated while diag uses our Kokkos kernel. +// // Only the first-call (non-reuse) path is ablated, matching the observed failure mode. +// #ifndef PFLARE_ABLATE_OFFDIAG_SUBMAT +// #define PFLARE_ABLATE_OFFDIAG_SUBMAT 1 +// #endif -// PetscCheckAbort(equal_flag, MPI_COMM_MATRIX, -// PETSC_ERR_PLIB, -// "rows not equal"); +// if (mpi) +// { +// #if PFLARE_ABLATE_OFFDIAG_SUBMAT +// if (!reuse_int) +// { +// // // We need global IS indices (is_row/is_col on device are already LOCAL, +// // // i.e. row_global - global_row_start; add back the offset before calling +// // // PETSc's CPU MatCreateSubMatrix which expects global indices). +// // PetscInt global_row_start_abl = 0, global_row_end_abl = 0; +// // PetscInt global_col_start_abl = 0, global_col_end_abl = 0; +// // PetscCallVoid(MatGetOwnershipRange(*input_mat, &global_row_start_abl, &global_row_end_abl)); +// // PetscCallVoid(MatGetOwnershipRangeColumn(*input_mat, &global_col_start_abl, &global_col_end_abl)); + +// // const PetscInt n_row_abl = (PetscInt)is_row_d_d.extent(0); +// // const PetscInt n_col_abl = (PetscInt)is_col_d_d.extent(0); +// // PetscInt *is_row_g_arr = NULL, *is_col_g_arr = NULL; +// // PetscCallVoid(PetscMalloc1(n_row_abl > 0 ? n_row_abl : 1, &is_row_g_arr)); +// // PetscCallVoid(PetscMalloc1(n_col_abl > 0 ? n_col_abl : 1, &is_col_g_arr)); + +// // // Copy local device indices to host then shift back to global. +// // PetscIntKokkosViewHost is_row_g_h(is_row_g_arr, n_row_abl); +// // PetscIntKokkosViewHost is_col_g_h(is_col_g_arr, n_col_abl); +// // Kokkos::deep_copy(exec, is_row_g_h, is_row_d_d); +// // Kokkos::deep_copy(exec, is_col_g_h, is_col_d_d); +// // Kokkos::fence(); +// // for (PetscInt ii = 0; ii < n_row_abl; ii++) is_row_g_arr[ii] += global_row_start_abl; +// // for (PetscInt ii = 0; ii < n_col_abl; ii++) is_col_g_arr[ii] += global_col_start_abl; + +// // IS is_row_g_abl = NULL, is_col_g_abl = NULL; +// // PetscCallVoid(ISCreateGeneral(MPI_COMM_MATRIX, n_row_abl, is_row_g_arr, PETSC_OWN_POINTER, &is_row_g_abl)); +// // PetscCallVoid(ISCreateGeneral(MPI_COMM_MATRIX, n_col_abl, is_col_g_arr, PETSC_OWN_POINTER, &is_col_g_abl)); + +// // PetscBool equal_flag; +// // PetscCallVoid(ISEqualUnsorted(is_row_g_abl, *rows_rows, &equal_flag)); + +// // PetscCheckAbort(equal_flag, MPI_COMM_MATRIX, +// // PETSC_ERR_PLIB, +// // "rows not equal"); -// PetscCallVoid(ISEqualUnsorted(is_col_g_abl, *cols_cols, &equal_flag)); - -// PetscCheckAbort(equal_flag, MPI_COMM_MATRIX, -// PETSC_ERR_PLIB, -// "cols not equal"); - -// Mat tmp_abl = NULL; - //PetscCallVoid(MatCreateSubMatrix(*input_mat, is_row_g_abl, is_col_g_abl, MAT_INITIAL_MATRIX, output_mat)); - PetscCallVoid(MatCreateSubMatrix(*input_mat, *rows_rows, *cols_cols, MAT_INITIAL_MATRIX, output_mat)); - //PetscCallVoid(MatConvert(tmp_abl, MATMPIAIJKOKKOS, MAT_INITIAL_MATRIX, output_mat)); - //PetscCallVoid(MatDestroy(&tmp_abl)); - //PetscCallVoid(MatDestroy(&output_mat_local)); // diag mat no longer needed - //PetscCallVoid(ISDestroy(&is_row_g_abl)); - //PetscCallVoid(ISDestroy(&is_col_g_abl)); - return; - } -#endif - PetscIntKokkosView is_col_o_d, garray_output_d; - - if (!reuse_int) - { - PetscInt isstart = 0; - /* Get start indices on each rank for the new columns */ - MPI_Scan(&local_cols_col, &isstart, 1, MPIU_INT, MPI_SUM, MPI_COMM_MATRIX); - isstart -= local_cols_col; - - // cmap values are encoded through PetscScalar and then cast back to PetscInt, - // so guard the exact integer range before using VecScatter transport. - // Anything larger than 9,000 trillion with 64 bit ints and 64 bit floats will break - should be fine for now - // Can't rely on PetscSFBcast with MPIU_INT as that was intermittently breaking - // on gpus so want to avoid - PetscInt max_encoded_value = global_cols_col > 0 ? global_cols_col - 1 : 0; - PetscCallVoid(check_exact_petscint_to_scalar_encoding(max_encoded_value, MPI_COMM_MATRIX)); - - // Kokkos version of ISGetSeqIS_SameColDist_Private (mpiaij.c) - // Uses VecScatter with PetscScalar Vecs (matching PETSc's own pattern) - // instead of direct PetscSFBcast with MPIU_INT on temporary views. - - std::cerr << "one " << std::endl; - - /* (1) iscol is a sub-column vector of mat, pad it with '-1.' to form a full vector x */ - Vec x_vec, cmap_vec; - PetscCallVoid(MatCreateVecs(*input_mat, &x_vec, NULL)); - PetscCallVoid(VecDuplicate(x_vec, &cmap_vec)); - - // Fill x_vec on device: x[is_col(i)] = is_col(i), rest = -1 +// // PetscCallVoid(ISEqualUnsorted(is_col_g_abl, *cols_cols, &equal_flag)); + +// // PetscCheckAbort(equal_flag, MPI_COMM_MATRIX, +// // PETSC_ERR_PLIB, +// // "cols not equal"); + +// // Mat tmp_abl = NULL; +// //PetscCallVoid(MatCreateSubMatrix(*input_mat, is_row_g_abl, is_col_g_abl, MAT_INITIAL_MATRIX, output_mat)); + PetscCallVoid(MatCreateSubMatrix(*input_mat, *rows_rows, *cols_cols, MAT_INITIAL_MATRIX, output_mat)); +// //PetscCallVoid(MatConvert(tmp_abl, MATMPIAIJKOKKOS, MAT_INITIAL_MATRIX, output_mat)); +// //PetscCallVoid(MatDestroy(&tmp_abl)); +// //PetscCallVoid(MatDestroy(&output_mat_local)); // diag mat no longer needed +// //PetscCallVoid(ISDestroy(&is_row_g_abl)); +// //PetscCallVoid(ISDestroy(&is_col_g_abl)); + return; +// } +// #endif +// PetscIntKokkosView is_col_o_d, garray_output_d; + +// if (!reuse_int) +// { +// PetscInt isstart = 0; +// /* Get start indices on each rank for the new columns */ +// MPI_Scan(&local_cols_col, &isstart, 1, MPIU_INT, MPI_SUM, MPI_COMM_MATRIX); +// isstart -= local_cols_col; + +// // cmap values are encoded through PetscScalar and then cast back to PetscInt, +// // so guard the exact integer range before using VecScatter transport. +// // Anything larger than 9,000 trillion with 64 bit ints and 64 bit floats will break - should be fine for now +// // Can't rely on PetscSFBcast with MPIU_INT as that was intermittently breaking +// // on gpus so want to avoid +// PetscInt max_encoded_value = global_cols_col > 0 ? global_cols_col - 1 : 0; +// PetscCallVoid(check_exact_petscint_to_scalar_encoding(max_encoded_value, MPI_COMM_MATRIX)); + +// // Kokkos version of ISGetSeqIS_SameColDist_Private (mpiaij.c) +// // Uses VecScatter with PetscScalar Vecs (matching PETSc's own pattern) +// // instead of direct PetscSFBcast with MPIU_INT on temporary views. + +// std::cerr << "one " << std::endl; + +// /* (1) iscol is a sub-column vector of mat, pad it with '-1.' to form a full vector x */ +// Vec x_vec, cmap_vec; +// PetscCallVoid(MatCreateVecs(*input_mat, &x_vec, NULL)); +// PetscCallVoid(VecDuplicate(x_vec, &cmap_vec)); + +// // Fill x_vec on device: x[is_col(i)] = is_col(i), rest = -1 - PetscScalarKokkosView x_scalar_d; - PetscCallVoid(VecGetKokkosViewWrite(x_vec, &x_scalar_d)); - Kokkos::deep_copy(exec, x_scalar_d, -1.0); - Kokkos::parallel_for( - Kokkos::RangePolicy<>(exec, 0, local_cols_col), KOKKOS_LAMBDA(PetscInt i) { - x_scalar_d(is_col_d_d(i)) = (PetscScalar)is_col_d_d(i); - }); - PetscCallVoid(VecRestoreKokkosViewWrite(x_vec, &x_scalar_d)); +// PetscScalarKokkosView x_scalar_d; +// PetscCallVoid(VecGetKokkosViewWrite(x_vec, &x_scalar_d)); +// Kokkos::deep_copy(exec, x_scalar_d, -1.0); +// Kokkos::parallel_for( +// Kokkos::RangePolicy<>(exec, 0, local_cols_col), KOKKOS_LAMBDA(PetscInt i) { +// x_scalar_d(is_col_d_d(i)) = (PetscScalar)is_col_d_d(i); +// }); +// PetscCallVoid(VecRestoreKokkosViewWrite(x_vec, &x_scalar_d)); - std::cerr << "two " << std::endl; +// std::cerr << "two " << std::endl; - /* (2) Scatter x and cmap using Mvctx to get their off-process portions */ - // Keep at most one active communication on Mvctx at a time. - // While Begin/End is in flight, do not touch the corresponding send/recv buffers. - Vec x_leaf_vec; - PetscCallVoid(VecDuplicate(mat_mpi->lvec, &x_leaf_vec)); - // Ensure send/receive buffers are stable before Begin. - Kokkos::fence(); - std::cerr << "two a " << std::endl; +// /* (2) Scatter x and cmap using Mvctx to get their off-process portions */ +// // Keep at most one active communication on Mvctx at a time. +// // While Begin/End is in flight, do not touch the corresponding send/recv buffers. +// Vec x_leaf_vec; +// PetscCallVoid(VecDuplicate(mat_mpi->lvec, &x_leaf_vec)); +// // Ensure send/receive buffers are stable before Begin. +// Kokkos::fence(); +// std::cerr << "two a " << std::endl; - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, x_vec, x_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); - // x scatter completed: x_leaf_vec is now safe to read. - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, x_vec, x_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); +// PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, x_vec, x_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); +// // x scatter completed: x_leaf_vec is now safe to read. +// PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, x_vec, x_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); - std::cerr << "two b" << std::endl; +// std::cerr << "two b" << std::endl; - // Fill cmap_vec on device: cmap[is_col(i)] = i + isstart, rest = -1 +// // Fill cmap_vec on device: cmap[is_col(i)] = i + isstart, rest = -1 - PetscScalarKokkosView cmap_scalar_d; - PetscCallVoid(VecGetKokkosViewWrite(cmap_vec, &cmap_scalar_d)); - Kokkos::deep_copy(exec, cmap_scalar_d, -1.0); - Kokkos::parallel_for( - Kokkos::RangePolicy<>(exec, 0, local_cols_col), KOKKOS_LAMBDA(PetscInt i) { - cmap_scalar_d(is_col_d_d(i)) = (PetscScalar)(i + isstart); - }); - PetscCallVoid(VecRestoreKokkosViewWrite(cmap_vec, &cmap_scalar_d)); +// PetscScalarKokkosView cmap_scalar_d; +// PetscCallVoid(VecGetKokkosViewWrite(cmap_vec, &cmap_scalar_d)); +// Kokkos::deep_copy(exec, cmap_scalar_d, -1.0); +// Kokkos::parallel_for( +// Kokkos::RangePolicy<>(exec, 0, local_cols_col), KOKKOS_LAMBDA(PetscInt i) { +// cmap_scalar_d(is_col_d_d(i)) = (PetscScalar)(i + isstart); +// }); +// PetscCallVoid(VecRestoreKokkosViewWrite(cmap_vec, &cmap_scalar_d)); - std::cerr << "three " << std::endl; - - Vec lcmap_vec; - PetscCallVoid(VecDuplicate(mat_mpi->lvec, &lcmap_vec)); - - /* (3) Count how many off-local columns match */ - PetscInt col_ao_output = 0; - - // One bigger for exclusive scan - auto is_col_o_match_d = PetscIntKokkosView("is_col_o_match_d", cols_ao+1); - Kokkos::deep_copy(exec, is_col_o_match_d, 0); - - // Start cmap scatter only after finishing x scatter on the same Mvctx. - // Ensure send/receive buffers are stable before Begin. - Kokkos::fence(); - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, cmap_vec, lcmap_vec, INSERT_VALUES, SCATTER_FORWARD)); - // cmap scatter completed: lcmap_vec is now safe to read. - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, cmap_vec, lcmap_vec, INSERT_VALUES, SCATTER_FORWARD)); - - //if (cols_ao > 0) - //{ - ConstPetscScalarKokkosView lvec_scalar_d; - PetscCallVoid(VecGetKokkosView(x_leaf_vec, &lvec_scalar_d)); - - Kokkos::parallel_reduce("FindMatches", Kokkos::RangePolicy<>(exec, 0, cols_ao), - KOKKOS_LAMBDA(const PetscInt i, PetscInt& thread_sum) { - // This is the scattered x for all of the non-local columns in the input mat - // It's not -1.0 if that column is present on another rank - if (lvec_scalar_d(i) > -1.0) { - thread_sum++; - is_col_o_match_d(i) = 1; // Mark this as a match - } - }, - Kokkos::Sum(col_ao_output) - ); - - PetscCallVoid(VecRestoreKokkosView(x_leaf_vec, &lvec_scalar_d)); - //} - - std::cerr << "four " << std::endl; - - - // Need to do an exclusive scan on is_col_o_match_d to get the new local indices - // Have to remember to go up to cols_ao+1 - Kokkos::parallel_scan(Kokkos::RangePolicy<>(exec, 0, cols_ao+1), KOKKOS_LAMBDA(const PetscInt i, PetscInt& partial_sum, const bool is_final) { - const int input_value = is_col_o_match_d(i); - if (is_final) { - is_col_o_match_d(i) = partial_sum; // Write exclusive prefix - } - partial_sum += input_value; // Update running total - }); - - // ~~~~~~~~~~~~ - // DIAGNOSTIC (Step 1 of plan): the parallel_reduce above produced - // col_ao_output on the host while the scan produced the per-index - // prefix sum on device. They must agree on the total count; if they - // don't, the size of is_col_o_d / garray_output_d below is wrong and - // the subsequent scatter kernel will write out of bounds. - // ~~~~~~~~~~~~ - { - PetscInt scan_total_h = 0; - auto tail_sv = Kokkos::subview(is_col_o_match_d, cols_ao); - Kokkos::View tail_h("PFLARE_DBG_scan_tail"); - Kokkos::deep_copy(exec, tail_h, tail_sv); - Kokkos::fence(); - scan_total_h = tail_h(); - PetscCheckAbort(scan_total_h == col_ao_output, MPI_COMM_MATRIX, - PETSC_ERR_PLIB, - "MatCreateSubMatrix_kokkos_view: parallel_reduce count (%" PetscInt_FMT ") disagrees with scan total (%" PetscInt_FMT "), cols_ao=%" PetscInt_FMT, - col_ao_output, scan_total_h, cols_ao); - PetscCheckAbort(col_ao_output >= 0 && col_ao_output <= cols_ao, MPI_COMM_MATRIX, - PETSC_ERR_PLIB, - "MatCreateSubMatrix_kokkos_view: col_ao_output=%" PetscInt_FMT " outside [0,%" PetscInt_FMT "]", - col_ao_output, cols_ao); - } - - // Local indices into input garray of the columns we want to keep - // but remember this doesn't mean garray_output = garray_input(is_col_o_d) - // as the of columns we have in the output has changed, ie we need - // the cmap_d given it has isstart - is_col_o_d = PetscIntKokkosView("is_col_o_d", col_ao_output); - garray_output_d = PetscIntKokkosView("garray_output_d", col_ao_output); - - // Loop over all the cols in the input matrix - //{ - ConstPetscScalarKokkosView lcmap_scalar_d; - PetscCallVoid(VecGetKokkosView(lcmap_vec, &lcmap_scalar_d)); - - Kokkos::parallel_for( - Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(PetscInt i) { - - // We can tell if is_col_o_match_d had 1 in it in this position by comparing the result - // of the exclusive scan for this index and the next one - if (is_col_o_match_d(i+1) > is_col_o_match_d(i)) - { - is_col_o_d(is_col_o_match_d(i)) = i; - garray_output_d(is_col_o_match_d(i)) = (PetscInt)lcmap_scalar_d(i); - } - }); - // Fence so the parallel for finishes - Kokkos::fence(); - - PetscCallVoid(VecRestoreKokkosView(lcmap_vec, &lcmap_scalar_d)); - //} - - std::cerr << "five " << std::endl; - - - // Cleanup Vecs - PetscCallVoid(VecDestroy(&x_vec)); - PetscCallVoid(VecDestroy(&x_leaf_vec)); - PetscCallVoid(VecDestroy(&cmap_vec)); - PetscCallVoid(VecDestroy(&lcmap_vec)); - } - // If we're reusing we have the iscol_o associated with the output_mat - else - { - // Get the iscol_o from the output_mat - IS iscol_o; - /* Retrieve isrow_d, iscol_d and iscol_o from output */ - PetscCallVoid(PetscObjectQuery((PetscObject)(*output_mat), "iscol_o", (PetscObject *)&iscol_o)); - //PetscCheck(iscol_o, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "iscol_o passed in was not used before, cannot reuse"); - - const PetscInt *iscol_o_indices_ptr; - PetscCallVoid(ISGetIndices(iscol_o, &iscol_o_indices_ptr)); - - PetscInt local_cols_iscol_o; - PetscCallVoid(ISGetLocalSize(iscol_o, &local_cols_iscol_o)); - - // Copy the iscol_o to the device - auto iscol_o_view_h = PetscIntConstKokkosViewHost(iscol_o_indices_ptr, local_cols_iscol_o); - is_col_o_d = PetscIntKokkosView("is_col_o_d", local_cols_iscol_o); - Kokkos::deep_copy(exec, is_col_o_d, iscol_o_view_h); - // Log copy with petsc - bytes = iscol_o_view_h.extent(0) * sizeof(PetscInt); - PetscCallVoid(PetscLogCpuToGpu(bytes)); - Kokkos::fence(); - - PetscCallVoid(ISRestoreIndices(iscol_o, &iscol_o_indices_ptr)); - } - - // We can now create the off-diagonal component - Kokkos::fence(); - MatCreateSubMatrix_Seq_kokkos(&mat_nonlocal, is_row_d_d, is_col_o_d, reuse_int, &output_mat_nonlocal); +// std::cerr << "three " << std::endl; + +// Vec lcmap_vec; +// PetscCallVoid(VecDuplicate(mat_mpi->lvec, &lcmap_vec)); + +// /* (3) Count how many off-local columns match */ +// PetscInt col_ao_output = 0; + +// // One bigger for exclusive scan +// auto is_col_o_match_d = PetscIntKokkosView("is_col_o_match_d", cols_ao+1); +// Kokkos::deep_copy(exec, is_col_o_match_d, 0); + +// // Start cmap scatter only after finishing x scatter on the same Mvctx. +// // Ensure send/receive buffers are stable before Begin. +// Kokkos::fence(); +// PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, cmap_vec, lcmap_vec, INSERT_VALUES, SCATTER_FORWARD)); +// // cmap scatter completed: lcmap_vec is now safe to read. +// PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, cmap_vec, lcmap_vec, INSERT_VALUES, SCATTER_FORWARD)); + +// //if (cols_ao > 0) +// //{ +// ConstPetscScalarKokkosView lvec_scalar_d; +// PetscCallVoid(VecGetKokkosView(x_leaf_vec, &lvec_scalar_d)); + +// Kokkos::parallel_reduce("FindMatches", Kokkos::RangePolicy<>(exec, 0, cols_ao), +// KOKKOS_LAMBDA(const PetscInt i, PetscInt& thread_sum) { +// // This is the scattered x for all of the non-local columns in the input mat +// // It's not -1.0 if that column is present on another rank +// if (lvec_scalar_d(i) > -1.0) { +// thread_sum++; +// is_col_o_match_d(i) = 1; // Mark this as a match +// } +// }, +// Kokkos::Sum(col_ao_output) +// ); + +// PetscCallVoid(VecRestoreKokkosView(x_leaf_vec, &lvec_scalar_d)); +// //} + +// std::cerr << "four " << std::endl; + + +// // Need to do an exclusive scan on is_col_o_match_d to get the new local indices +// // Have to remember to go up to cols_ao+1 +// Kokkos::parallel_scan(Kokkos::RangePolicy<>(exec, 0, cols_ao+1), KOKKOS_LAMBDA(const PetscInt i, PetscInt& partial_sum, const bool is_final) { +// const int input_value = is_col_o_match_d(i); +// if (is_final) { +// is_col_o_match_d(i) = partial_sum; // Write exclusive prefix +// } +// partial_sum += input_value; // Update running total +// }); + +// // ~~~~~~~~~~~~ +// // DIAGNOSTIC (Step 1 of plan): the parallel_reduce above produced +// // col_ao_output on the host while the scan produced the per-index +// // prefix sum on device. They must agree on the total count; if they +// // don't, the size of is_col_o_d / garray_output_d below is wrong and +// // the subsequent scatter kernel will write out of bounds. +// // ~~~~~~~~~~~~ +// { +// PetscInt scan_total_h = 0; +// auto tail_sv = Kokkos::subview(is_col_o_match_d, cols_ao); +// Kokkos::View tail_h("PFLARE_DBG_scan_tail"); +// Kokkos::deep_copy(exec, tail_h, tail_sv); +// Kokkos::fence(); +// scan_total_h = tail_h(); +// PetscCheckAbort(scan_total_h == col_ao_output, MPI_COMM_MATRIX, +// PETSC_ERR_PLIB, +// "MatCreateSubMatrix_kokkos_view: parallel_reduce count (%" PetscInt_FMT ") disagrees with scan total (%" PetscInt_FMT "), cols_ao=%" PetscInt_FMT, +// col_ao_output, scan_total_h, cols_ao); +// PetscCheckAbort(col_ao_output >= 0 && col_ao_output <= cols_ao, MPI_COMM_MATRIX, +// PETSC_ERR_PLIB, +// "MatCreateSubMatrix_kokkos_view: col_ao_output=%" PetscInt_FMT " outside [0,%" PetscInt_FMT "]", +// col_ao_output, cols_ao); +// } + +// // Local indices into input garray of the columns we want to keep +// // but remember this doesn't mean garray_output = garray_input(is_col_o_d) +// // as the of columns we have in the output has changed, ie we need +// // the cmap_d given it has isstart +// is_col_o_d = PetscIntKokkosView("is_col_o_d", col_ao_output); +// garray_output_d = PetscIntKokkosView("garray_output_d", col_ao_output); + +// // Loop over all the cols in the input matrix +// //{ +// ConstPetscScalarKokkosView lcmap_scalar_d; +// PetscCallVoid(VecGetKokkosView(lcmap_vec, &lcmap_scalar_d)); + +// Kokkos::parallel_for( +// Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(PetscInt i) { + +// // We can tell if is_col_o_match_d had 1 in it in this position by comparing the result +// // of the exclusive scan for this index and the next one +// if (is_col_o_match_d(i+1) > is_col_o_match_d(i)) +// { +// is_col_o_d(is_col_o_match_d(i)) = i; +// garray_output_d(is_col_o_match_d(i)) = (PetscInt)lcmap_scalar_d(i); +// } +// }); +// // Fence so the parallel for finishes +// Kokkos::fence(); + +// PetscCallVoid(VecRestoreKokkosView(lcmap_vec, &lcmap_scalar_d)); +// //} + +// std::cerr << "five " << std::endl; + + +// // Cleanup Vecs +// PetscCallVoid(VecDestroy(&x_vec)); +// PetscCallVoid(VecDestroy(&x_leaf_vec)); +// PetscCallVoid(VecDestroy(&cmap_vec)); +// PetscCallVoid(VecDestroy(&lcmap_vec)); +// } +// // If we're reusing we have the iscol_o associated with the output_mat +// else +// { +// // Get the iscol_o from the output_mat +// IS iscol_o; +// /* Retrieve isrow_d, iscol_d and iscol_o from output */ +// PetscCallVoid(PetscObjectQuery((PetscObject)(*output_mat), "iscol_o", (PetscObject *)&iscol_o)); +// //PetscCheck(iscol_o, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "iscol_o passed in was not used before, cannot reuse"); + +// const PetscInt *iscol_o_indices_ptr; +// PetscCallVoid(ISGetIndices(iscol_o, &iscol_o_indices_ptr)); + +// PetscInt local_cols_iscol_o; +// PetscCallVoid(ISGetLocalSize(iscol_o, &local_cols_iscol_o)); + +// // Copy the iscol_o to the device +// auto iscol_o_view_h = PetscIntConstKokkosViewHost(iscol_o_indices_ptr, local_cols_iscol_o); +// is_col_o_d = PetscIntKokkosView("is_col_o_d", local_cols_iscol_o); +// Kokkos::deep_copy(exec, is_col_o_d, iscol_o_view_h); +// // Log copy with petsc +// bytes = iscol_o_view_h.extent(0) * sizeof(PetscInt); +// PetscCallVoid(PetscLogCpuToGpu(bytes)); +// Kokkos::fence(); - // If it's our first time through we have to create our output matrix - if (!reuse_int) - { - std::cerr << "six " << std::endl; +// PetscCallVoid(ISRestoreIndices(iscol_o, &iscol_o_indices_ptr)); +// } - // Copy the garray output to the host - PetscInt *garray_host = NULL; - PetscCallVoid(PetscMalloc1(garray_output_d.extent(0), &garray_host)); - PetscIntKokkosViewHost colmap_output_h = PetscIntKokkosViewHost(garray_host, garray_output_d.extent(0)); - // Copy the garray output to the host - Kokkos::deep_copy(exec, colmap_output_h, garray_output_d); - Kokkos::fence(); - bytes = colmap_output_h.extent(0) * sizeof(PetscInt); - PetscCallVoid(PetscLogGpuToCpu(bytes)); +// // We can now create the off-diagonal component +// Kokkos::fence(); +// MatCreateSubMatrix_Seq_kokkos(&mat_nonlocal, is_row_d_d, is_col_o_d, reuse_int, &output_mat_nonlocal); + +// // If it's our first time through we have to create our output matrix +// if (!reuse_int) +// { +// std::cerr << "six " << std::endl; + +// // Copy the garray output to the host +// PetscInt *garray_host = NULL; +// PetscCallVoid(PetscMalloc1(garray_output_d.extent(0), &garray_host)); +// PetscIntKokkosViewHost colmap_output_h = PetscIntKokkosViewHost(garray_host, garray_output_d.extent(0)); +// // Copy the garray output to the host +// Kokkos::deep_copy(exec, colmap_output_h, garray_output_d); +// Kokkos::fence(); +// bytes = colmap_output_h.extent(0) * sizeof(PetscInt); +// PetscCallVoid(PetscLogGpuToCpu(bytes)); - std::cerr << "seven " << std::endl; +// std::cerr << "seven " << std::endl; - // We can now create our MPI matrix - PetscCallVoid(MatCreateMPIAIJWithSeqAIJ(MPI_COMM_MATRIX, global_rows_row, global_cols_col, output_mat_local, output_mat_nonlocal, garray_host, output_mat)); - - std::cerr << "eight " << std::endl; - - // ~~~~~~~~~~~~~~ - // If this is the first time through, we need to store the iscol_o in the output_mat - // We don't store the is_row_d_d or is_col_d_d like the host version does as they're super cheap to rebuild - // ~~~~~~~~~~~~~~ - // Copy the is_col_o_d to the host - PetscInt *is_col_o_host = NULL; - PetscCallVoid(PetscMalloc1(is_col_o_d.extent(0), &is_col_o_host)); - PetscIntKokkosViewHost is_col_o_h = PetscIntKokkosViewHost(is_col_o_host, is_col_o_d.extent(0)); - // Copy the is_col_o_d output to the host - Kokkos::deep_copy(exec, is_col_o_h, is_col_o_d); - Kokkos::fence(); - bytes = is_col_o_h.extent(0) * sizeof(PetscInt); - PetscCallVoid(PetscLogGpuToCpu(bytes)); - // Now create an IS - IS iscol_o; - PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, is_col_o_h.extent(0), is_col_o_host, PETSC_COPY_VALUES, &iscol_o)); - // Register it with the output_mat - PetscCallVoid(PetscObjectCompose((PetscObject)(*output_mat), "iscol_o", (PetscObject)iscol_o)); - // The ref counter is incremented by the compose - //PetscCallVoid(ISDestroy(&iscol_o)); - - std::cerr << "nine " << std::endl; - - } - } - else - { - *output_mat = output_mat_local; - } +// // We can now create our MPI matrix +// PetscCallVoid(MatCreateMPIAIJWithSeqAIJ(MPI_COMM_MATRIX, global_rows_row, global_cols_col, output_mat_local, output_mat_nonlocal, garray_host, output_mat)); + +// std::cerr << "eight " << std::endl; + +// // ~~~~~~~~~~~~~~ +// // If this is the first time through, we need to store the iscol_o in the output_mat +// // We don't store the is_row_d_d or is_col_d_d like the host version does as they're super cheap to rebuild +// // ~~~~~~~~~~~~~~ +// // Copy the is_col_o_d to the host +// PetscInt *is_col_o_host = NULL; +// PetscCallVoid(PetscMalloc1(is_col_o_d.extent(0), &is_col_o_host)); +// PetscIntKokkosViewHost is_col_o_h = PetscIntKokkosViewHost(is_col_o_host, is_col_o_d.extent(0)); +// // Copy the is_col_o_d output to the host +// Kokkos::deep_copy(exec, is_col_o_h, is_col_o_d); +// Kokkos::fence(); +// bytes = is_col_o_h.extent(0) * sizeof(PetscInt); +// PetscCallVoid(PetscLogGpuToCpu(bytes)); +// // Now create an IS +// IS iscol_o; +// PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, is_col_o_h.extent(0), is_col_o_host, PETSC_COPY_VALUES, &iscol_o)); +// // Register it with the output_mat +// PetscCallVoid(PetscObjectCompose((PetscObject)(*output_mat), "iscol_o", (PetscObject)iscol_o)); +// // The ref counter is incremented by the compose +// //PetscCallVoid(ISDestroy(&iscol_o)); + +// std::cerr << "nine " << std::endl; + +// } +// } +// else +// { +// *output_mat = output_mat_local; +// } - return; +// return; } //------------------------------------------------------------------------------------------------------------------------ From f1029882672c88e0b08607f164fbc15359e1d858 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Fri, 10 Apr 2026 00:27:16 +0100 Subject: [PATCH 53/60] Pass by copy --- src/PETSc_Helperk.kokkos.cxx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 9d577515..5b350be2 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2423,8 +2423,8 @@ PETSC_INTERN void MatCreateSubMatrix_Seq_kokkos(Mat *input_mat, PetscIntKokkosVi // as the matrices, ie equivalent to MatCreateSubMatrix_MPIAIJ_SameRowDist // is_col must be sorted // This one uses the views is_row_d_d and is_col_d_d directly, rewritten to be the local indices -PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosView &is_row_d_d, PetscInt global_rows_row, \ - PetscIntKokkosView &is_col_d_d, PetscInt global_cols_col, const int reuse_int, Mat *output_mat, IS *rows_rows, IS *cols_cols) +PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosView is_row_d_d, PetscInt global_rows_row, \ + PetscIntKokkosView is_col_d_d, PetscInt global_cols_col, const int reuse_int, Mat *output_mat, IS *rows_rows, IS *cols_cols) { // PflareKokkosTrace _trace("MatCreateSubMatrix_kokkos_view"); // PetscInt local_rows, local_cols; From 13e1fd7fbb058923fe11318e04e173b99dd8b20b Mon Sep 17 00:00:00 2001 From: sdargavi Date: Fri, 10 Apr 2026 00:52:39 +0100 Subject: [PATCH 54/60] Remove kokkos code --- src/PETSc_Helperk.kokkos.cxx | 158 ++++++++++++++++++----------------- 1 file changed, 80 insertions(+), 78 deletions(-) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 5b350be2..44540033 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2884,87 +2884,89 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos(Mat *input_mat, IS *is_row, IS *is_c PetscCallVoid(ISGetSize(*is_row, &global_rows_row)); PetscCallVoid(ISGetSize(*is_col, &global_cols_col)); - // Equivalent to calling MatSeqAIJKokkosSyncDevice which is petsc intern - mat_sync(input_mat); + // // Equivalent to calling MatSeqAIJKokkosSyncDevice which is petsc intern + // mat_sync(input_mat); - PetscIntKokkosView is_row_d_d, is_col_d_d; - const int level_idx = our_level - 1; - auto exec = PetscGetKokkosExecutionSpace(); - - // If we want the input is_row and is_col to be used - if (our_level == -1) - { - // Get pointers to the indices on the host - const PetscInt *is_row_indices_ptr, *is_col_indices_ptr; - PetscCallVoid(ISGetIndices(*is_row, &is_row_indices_ptr)); - PetscCallVoid(ISGetIndices(*is_col, &is_col_indices_ptr)); - - PetscInt local_rows_row, local_cols_col; - PetscCallVoid(ISGetLocalSize(*is_row, &local_rows_row)); - PetscCallVoid(ISGetLocalSize(*is_col, &local_cols_col)); - - // Create a host view of the existing indices - auto is_row_view_h = PetscIntConstKokkosViewHost(is_row_indices_ptr, local_rows_row); - is_row_d_d = PetscIntKokkosView("is_row_d_d", local_rows_row); - auto is_col_view_h = PetscIntConstKokkosViewHost(is_col_indices_ptr, local_cols_col); - is_col_d_d = PetscIntKokkosView("is_col_d_d", local_cols_col); - // Copy indices to the device - Kokkos::deep_copy(exec, is_row_d_d, is_row_view_h); - Kokkos::deep_copy(exec, is_col_d_d, is_col_view_h); - // The source pointers come from ISGetIndices; ensure async copies complete - // before restoring those host buffers. - Kokkos::fence(); - // Log copy with petsc - size_t bytes = is_row_view_h.extent(0) * sizeof(PetscInt); - PetscCallVoid(PetscLogCpuToGpu(bytes)); - bytes = is_col_view_h.extent(0) * sizeof(PetscInt); - PetscCallVoid(PetscLogCpuToGpu(bytes)); - - PetscCallVoid(ISRestoreIndices(*is_row, &is_row_indices_ptr)); - PetscCallVoid(ISRestoreIndices(*is_col, &is_col_indices_ptr)); - - // ~~~~~~~~~~~~ - // Rewrite to local indices - // ~~~~~~~~~~~~ - Kokkos::parallel_for( - Kokkos::RangePolicy<>(exec, 0, is_row_d_d.extent(0)), KOKKOS_LAMBDA(PetscInt i) { - - is_row_d_d(i) -= global_row_start; // Make local - }); - - Kokkos::parallel_for( - Kokkos::RangePolicy<>(exec, 0, is_col_d_d.extent(0)), KOKKOS_LAMBDA(PetscInt i) { - - is_col_d_d(i) -= global_col_start; // Make local - }); - Kokkos::fence(); - } - // Instead if we tell the routine that the is_row and is_col are fine/coarse local indices - // that already are on the device - else - { - if (is_row_fine_int) - { - is_row_d_d = *IS_fine_views_local[level_idx]; - } - else - { - is_row_d_d = *IS_coarse_views_local[level_idx]; - } - if (is_col_fine_int) - { - is_col_d_d = *IS_fine_views_local[level_idx]; - } - else - { - is_col_d_d = *IS_coarse_views_local[level_idx]; - } - } - - // PetscCallVoid(MatCreateSubMatrix(*input_mat, *is_row, *is_col, MAT_INITIAL_MATRIX, output_mat)); + // PetscIntKokkosView is_row_d_d, is_col_d_d; + // const int level_idx = our_level - 1; + // auto exec = PetscGetKokkosExecutionSpace(); + + // // If we want the input is_row and is_col to be used + // if (our_level == -1) + // { + // // Get pointers to the indices on the host + // const PetscInt *is_row_indices_ptr, *is_col_indices_ptr; + // PetscCallVoid(ISGetIndices(*is_row, &is_row_indices_ptr)); + // PetscCallVoid(ISGetIndices(*is_col, &is_col_indices_ptr)); + + // PetscInt local_rows_row, local_cols_col; + // PetscCallVoid(ISGetLocalSize(*is_row, &local_rows_row)); + // PetscCallVoid(ISGetLocalSize(*is_col, &local_cols_col)); + + // // Create a host view of the existing indices + // auto is_row_view_h = PetscIntConstKokkosViewHost(is_row_indices_ptr, local_rows_row); + // is_row_d_d = PetscIntKokkosView("is_row_d_d", local_rows_row); + // auto is_col_view_h = PetscIntConstKokkosViewHost(is_col_indices_ptr, local_cols_col); + // is_col_d_d = PetscIntKokkosView("is_col_d_d", local_cols_col); + // // Copy indices to the device + // Kokkos::deep_copy(exec, is_row_d_d, is_row_view_h); + // Kokkos::deep_copy(exec, is_col_d_d, is_col_view_h); + // // The source pointers come from ISGetIndices; ensure async copies complete + // // before restoring those host buffers. + // Kokkos::fence(); + // // Log copy with petsc + // size_t bytes = is_row_view_h.extent(0) * sizeof(PetscInt); + // PetscCallVoid(PetscLogCpuToGpu(bytes)); + // bytes = is_col_view_h.extent(0) * sizeof(PetscInt); + // PetscCallVoid(PetscLogCpuToGpu(bytes)); + + // PetscCallVoid(ISRestoreIndices(*is_row, &is_row_indices_ptr)); + // PetscCallVoid(ISRestoreIndices(*is_col, &is_col_indices_ptr)); + + // // ~~~~~~~~~~~~ + // // Rewrite to local indices + // // ~~~~~~~~~~~~ + // Kokkos::parallel_for( + // Kokkos::RangePolicy<>(exec, 0, is_row_d_d.extent(0)), KOKKOS_LAMBDA(PetscInt i) { + + // is_row_d_d(i) -= global_row_start; // Make local + // }); + + // Kokkos::parallel_for( + // Kokkos::RangePolicy<>(exec, 0, is_col_d_d.extent(0)), KOKKOS_LAMBDA(PetscInt i) { + + // is_col_d_d(i) -= global_col_start; // Make local + // }); + // Kokkos::fence(); + // } + // // Instead if we tell the routine that the is_row and is_col are fine/coarse local indices + // // that already are on the device + // else + // { + // if (is_row_fine_int) + // { + // is_row_d_d = *IS_fine_views_local[level_idx]; + // } + // else + // { + // is_row_d_d = *IS_coarse_views_local[level_idx]; + // } + // if (is_col_fine_int) + // { + // is_col_d_d = *IS_fine_views_local[level_idx]; + // } + // else + // { + // is_col_d_d = *IS_coarse_views_local[level_idx]; + // } + // } + + // ### path 2 + PetscCallVoid(MatCreateSubMatrix(*input_mat, *is_row, *is_col, MAT_INITIAL_MATRIX, output_mat)); // return; - MatCreateSubMatrix_kokkos_view(input_mat, is_row_d_d, global_rows_row, is_col_d_d, global_cols_col, reuse_int, output_mat, is_row, is_col); + // ### path 1 + // MatCreateSubMatrix_kokkos_view(input_mat, is_row_d_d, global_rows_row, is_col_d_d, global_cols_col, reuse_int, output_mat, is_row, is_col); return; } From 4a27531c30bc857c7d36fd2c20b6a7c530c0d8a3 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Fri, 10 Apr 2026 01:04:29 +0100 Subject: [PATCH 55/60] Allow more of the kokkos 1 --- src/PETSc_Helperk.kokkos.cxx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 44540033..a06ec5c6 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2884,12 +2884,12 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos(Mat *input_mat, IS *is_row, IS *is_c PetscCallVoid(ISGetSize(*is_row, &global_rows_row)); PetscCallVoid(ISGetSize(*is_col, &global_cols_col)); - // // Equivalent to calling MatSeqAIJKokkosSyncDevice which is petsc intern - // mat_sync(input_mat); + // Equivalent to calling MatSeqAIJKokkosSyncDevice which is petsc intern + mat_sync(input_mat); - // PetscIntKokkosView is_row_d_d, is_col_d_d; - // const int level_idx = our_level - 1; - // auto exec = PetscGetKokkosExecutionSpace(); + PetscIntKokkosView is_row_d_d, is_col_d_d; + const int level_idx = our_level - 1; + auto exec = PetscGetKokkosExecutionSpace(); // // If we want the input is_row and is_col to be used // if (our_level == -1) From 7e8686372bc0f19eed3c92505f771d7eeea25cbe Mon Sep 17 00:00:00 2001 From: sdargavi Date: Fri, 10 Apr 2026 01:22:17 +0100 Subject: [PATCH 56/60] Keep our_level minus one path --- src/PETSc_Helperk.kokkos.cxx | 96 ++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index a06ec5c6..a101e312 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2891,54 +2891,54 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos(Mat *input_mat, IS *is_row, IS *is_c const int level_idx = our_level - 1; auto exec = PetscGetKokkosExecutionSpace(); - // // If we want the input is_row and is_col to be used - // if (our_level == -1) - // { - // // Get pointers to the indices on the host - // const PetscInt *is_row_indices_ptr, *is_col_indices_ptr; - // PetscCallVoid(ISGetIndices(*is_row, &is_row_indices_ptr)); - // PetscCallVoid(ISGetIndices(*is_col, &is_col_indices_ptr)); - - // PetscInt local_rows_row, local_cols_col; - // PetscCallVoid(ISGetLocalSize(*is_row, &local_rows_row)); - // PetscCallVoid(ISGetLocalSize(*is_col, &local_cols_col)); - - // // Create a host view of the existing indices - // auto is_row_view_h = PetscIntConstKokkosViewHost(is_row_indices_ptr, local_rows_row); - // is_row_d_d = PetscIntKokkosView("is_row_d_d", local_rows_row); - // auto is_col_view_h = PetscIntConstKokkosViewHost(is_col_indices_ptr, local_cols_col); - // is_col_d_d = PetscIntKokkosView("is_col_d_d", local_cols_col); - // // Copy indices to the device - // Kokkos::deep_copy(exec, is_row_d_d, is_row_view_h); - // Kokkos::deep_copy(exec, is_col_d_d, is_col_view_h); - // // The source pointers come from ISGetIndices; ensure async copies complete - // // before restoring those host buffers. - // Kokkos::fence(); - // // Log copy with petsc - // size_t bytes = is_row_view_h.extent(0) * sizeof(PetscInt); - // PetscCallVoid(PetscLogCpuToGpu(bytes)); - // bytes = is_col_view_h.extent(0) * sizeof(PetscInt); - // PetscCallVoid(PetscLogCpuToGpu(bytes)); - - // PetscCallVoid(ISRestoreIndices(*is_row, &is_row_indices_ptr)); - // PetscCallVoid(ISRestoreIndices(*is_col, &is_col_indices_ptr)); - - // // ~~~~~~~~~~~~ - // // Rewrite to local indices - // // ~~~~~~~~~~~~ - // Kokkos::parallel_for( - // Kokkos::RangePolicy<>(exec, 0, is_row_d_d.extent(0)), KOKKOS_LAMBDA(PetscInt i) { - - // is_row_d_d(i) -= global_row_start; // Make local - // }); - - // Kokkos::parallel_for( - // Kokkos::RangePolicy<>(exec, 0, is_col_d_d.extent(0)), KOKKOS_LAMBDA(PetscInt i) { - - // is_col_d_d(i) -= global_col_start; // Make local - // }); - // Kokkos::fence(); - // } + // If we want the input is_row and is_col to be used + if (our_level == -1) + { + // Get pointers to the indices on the host + const PetscInt *is_row_indices_ptr, *is_col_indices_ptr; + PetscCallVoid(ISGetIndices(*is_row, &is_row_indices_ptr)); + PetscCallVoid(ISGetIndices(*is_col, &is_col_indices_ptr)); + + PetscInt local_rows_row, local_cols_col; + PetscCallVoid(ISGetLocalSize(*is_row, &local_rows_row)); + PetscCallVoid(ISGetLocalSize(*is_col, &local_cols_col)); + + // Create a host view of the existing indices + auto is_row_view_h = PetscIntConstKokkosViewHost(is_row_indices_ptr, local_rows_row); + is_row_d_d = PetscIntKokkosView("is_row_d_d", local_rows_row); + auto is_col_view_h = PetscIntConstKokkosViewHost(is_col_indices_ptr, local_cols_col); + is_col_d_d = PetscIntKokkosView("is_col_d_d", local_cols_col); + // Copy indices to the device + Kokkos::deep_copy(exec, is_row_d_d, is_row_view_h); + Kokkos::deep_copy(exec, is_col_d_d, is_col_view_h); + // The source pointers come from ISGetIndices; ensure async copies complete + // before restoring those host buffers. + Kokkos::fence(); + // Log copy with petsc + size_t bytes = is_row_view_h.extent(0) * sizeof(PetscInt); + PetscCallVoid(PetscLogCpuToGpu(bytes)); + bytes = is_col_view_h.extent(0) * sizeof(PetscInt); + PetscCallVoid(PetscLogCpuToGpu(bytes)); + + PetscCallVoid(ISRestoreIndices(*is_row, &is_row_indices_ptr)); + PetscCallVoid(ISRestoreIndices(*is_col, &is_col_indices_ptr)); + + // ~~~~~~~~~~~~ + // Rewrite to local indices + // ~~~~~~~~~~~~ + Kokkos::parallel_for( + Kokkos::RangePolicy<>(exec, 0, is_row_d_d.extent(0)), KOKKOS_LAMBDA(PetscInt i) { + + is_row_d_d(i) -= global_row_start; // Make local + }); + + Kokkos::parallel_for( + Kokkos::RangePolicy<>(exec, 0, is_col_d_d.extent(0)), KOKKOS_LAMBDA(PetscInt i) { + + is_col_d_d(i) -= global_col_start; // Make local + }); + Kokkos::fence(); + } // // Instead if we tell the routine that the is_row and is_col are fine/coarse local indices // // that already are on the device // else From 5e4c8127bf53ce14e9d5f5a36b1afe0e2f0a4694 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Fri, 10 Apr 2026 13:38:27 +0100 Subject: [PATCH 57/60] More fences around cf splitting --- src/DDC_Modulek.kokkos.cxx | 8 +++++++ src/MatDiagDomk.kokkos.cxx | 13 +++++++++++ src/PMISR_Modulek.kokkos.cxx | 42 ++++++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+) diff --git a/src/DDC_Modulek.kokkos.cxx b/src/DDC_Modulek.kokkos.cxx index 13f782dd..2a49c027 100644 --- a/src/DDC_Modulek.kokkos.cxx +++ b/src/DDC_Modulek.kokkos.cxx @@ -69,11 +69,15 @@ PETSC_INTERN void ddc_kokkos(Mat *input_mat, const PetscReal fraction_swap, cons // recompute // ~~~~~~~~~~~~~~~ { + Kokkos::fence(); + // Create measure and cf_markers for Aff PetscScalarKokkosView measure_d("measure_d", local_rows_aff); intKokkosView cf_markers_aff_d("cf_markers_aff_d", local_rows_aff); Kokkos::deep_copy(exec, cf_markers_aff_d, 0); + Kokkos::fence(); + // Copy the random numbers from host to device // These are generated in the Fortran wrapper so CPU and Kokkos use the same randoms PetscScalarKokkosViewHost random_h(random_numbers, local_rows_aff); @@ -84,6 +88,8 @@ PETSC_INTERN void ddc_kokkos(Mat *input_mat, const PetscReal fraction_swap, cons const PetscReal max_scale = std::max(10.0, max_dd_ratio_achieved * 2.0); const PetscReal target_ratio = max_dd_ratio; + Kokkos::fence(); + // Build the measure: // pmisr_existing_measure_cf_markers tags the smallest measure as F points // So we feed in measure = max(10, max_achieved*2) - (diag_dom_ratio - random/1e10) @@ -110,6 +116,8 @@ PETSC_INTERN void ddc_kokkos(Mat *input_mat, const PetscReal fraction_swap, cons check_cf_markers_all_marked_kokkos(cf_markers_aff_d, cf_markers_aff_d.extent(0), MPI_COMM_MATRIX); + Kokkos::fence(); + // Swap F-tagged points back into cf_markers_d Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, local_rows_aff), KOKKOS_LAMBDA(PetscInt i) { diff --git a/src/MatDiagDomk.kokkos.cxx b/src/MatDiagDomk.kokkos.cxx index fd5f4247..ed33bcc9 100644 --- a/src/MatDiagDomk.kokkos.cxx +++ b/src/MatDiagDomk.kokkos.cxx @@ -17,6 +17,8 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio PflareKokkosTrace _trace("MatDiagDomRatio_kokkos"); PetscInt local_rows, local_cols; + Kokkos::fence(); + mat_sync(input_mat); // Are we in parallel? @@ -64,6 +66,8 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio diag_dom_ratio_local_d = PetscScalarKokkosView("diag_dom_ratio_local_d", local_rows_row); PetscScalarKokkosView diag_dom_ratio_d = diag_dom_ratio_local_d; + Kokkos::fence(); + // ~~~~~~~~~~~~~~~ // Can now go and compute the diagonal dominance sums // ~~~~~~~~~~~~~~~ @@ -111,6 +115,8 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio PetscScalarKokkosView diag_entry_d = PetscScalarKokkosView("diag_entry_d", local_rows_row); Kokkos::deep_copy(exec, diag_entry_d, 0); + Kokkos::fence(); + // Scoping to reduce peak memory { // We now go and do a reduce to get the diagonal entry, while also @@ -160,11 +166,13 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio diag_dom_ratio_d(i_idx_is_row) = sum_val; }); }); + Kokkos::fence(); } // Finish the in-flight scatter and only then read from the receive buffer. if (mpi) { + Kokkos::fence(); { ConstPetscScalarKokkosView lvec_scalar_d; PetscCallVoid(VecGetKokkosView(scatter_leaf_vec, &lvec_scalar_d)); @@ -234,9 +242,12 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio diag_dom_ratio_d(i_idx_is_row) += sum_val; }); }); + Kokkos::fence(); } } + Kokkos::fence(); + // ~~~~~~~~~~~~~ // Compute the diag dominance ratio // ~~~~~~~~~~~~~ @@ -264,6 +275,8 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio Kokkos::Max(max_dd_ratio_local) ); + Kokkos::fence(); + PetscCallMPIAbort(MPI_COMM_MATRIX, MPI_Allreduce(&max_dd_ratio_local, max_dd_ratio_achieved, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_MATRIX)); return; diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx index 9c57682d..9757b4a5 100644 --- a/src/PMISR_Modulek.kokkos.cxx +++ b/src/PMISR_Modulek.kokkos.cxx @@ -591,6 +591,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength // This returns the global index of the local portion of the matrix PetscCallVoid(MatGetOwnershipRange(*strength_mat, &global_row_start, &global_row_end_plus_one)); + Kokkos::fence(); + // ~~~~~~~~~~~~ // Form the local S+S^T and get CSR pointers // We explicitly compute the local part of S+S^T so we don't have to @@ -623,6 +625,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength intKokkosView cf_markers_nonlocal_d; PetscScalarKokkosView measure_nonlocal_d; + Kokkos::fence(); + // ~~~~~~~~~~~~~~~ // veto stores whether a node has been veto'd as a candidate // .NOT. veto(i) means the node can be in the set @@ -708,6 +712,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength } }, counter_in_set_start); + Kokkos::fence(); + // Check the total number of undecided in parallel PetscInt counter_undecided, counter_parallel; if (max_luby_steps < 0) { @@ -736,6 +742,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength PetscCallVoid(VecDuplicate(mat_mpi->lvec, &scatter_leaf_vec)); } + Kokkos::fence(); + // Let's keep track of how many times we go through the loops int loops_through = -1; do @@ -782,6 +790,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength } } + Kokkos::fence(); + // ~~~~~~~~ // Now we use veto to keep track of which candidates can be in the set // Locally we know which ones cannot be in the set due to local strong dependencies (mat_local), @@ -844,6 +854,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength } }); + Kokkos::fence(); + // ~~~~~~~~ // Now let's go through and veto candidates which have strong influences on this rank // ie non-local nodes that influence local nodes through S^T @@ -896,6 +908,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength }); } + Kokkos::fence(); + // Reduce the vetos with a lor via VecScatter ADD_VALUES SCATTER_REVERSE // (LOR is equivalent to sum when values are 0/1 bools) { @@ -930,6 +944,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength PetscCallVoid(VecRestoreKokkosView(scatter_root_vec, &root_scalar_d)); } + Kokkos::fence(); + // Now the comms have finished, we know exactly which local nodes on this rank have no // local strong dependencies, influences, non-local influences but not yet non-local dependencies // Let's do the non-local dependencies and then now that the comms are done on veto_local_d @@ -970,6 +986,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength }); } }); + + Kokkos::fence(); } // This cf_markers_d(i) = loops_through happens above in the case of mpi, saves a kernel launch else @@ -981,6 +999,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength if (!veto_local_d(i)) cf_markers_d(i) = loops_through; }); + + Kokkos::fence(); } // ~~~~~~~~~~~~~ @@ -1023,10 +1043,14 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength } }); + Kokkos::fence(); + // We use the veto arrays here to do this comms Kokkos::deep_copy(exec, veto_nonlocal_d, false); Kokkos::deep_copy(exec, veto_local_d, false); + Kokkos::fence(); + // Set non-local strong dependencies Kokkos::parallel_for( Kokkos::TeamPolicy<>(exec, local_rows, Kokkos::AUTO()), @@ -1051,6 +1075,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength } }); + Kokkos::fence(); + // Reduce the veto_nonlocal_d with a lor via VecScatter ADD_VALUES SCATTER_REVERSE // Any local node with veto set to true is not in the set { @@ -1085,6 +1111,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength PetscCallVoid(VecRestoreKokkosView(scatter_root_vec, &root_scalar_d)); } + Kokkos::fence(); + // Let's finish the non-local dependencies // If this node has been veto'd, then set it to not in the set Kokkos::parallel_for( @@ -1094,6 +1122,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength } }); + Kokkos::fence(); + // Now that non-local dependencies are marked, broadcast the cf_markers so that // on other ranks we know which nodes have cf_markers_nonlocal_d(i) == loops_through // (i.e. which nonlocal nodes were assigned to the IS this iteration). @@ -1121,6 +1151,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength PetscCallVoid(VecRestoreKokkosView(scatter_leaf_vec, &leaf_scalar_d)); } + Kokkos::fence(); + // And now we have the information we need to set any of the non-local influences if (mat_nonlocal_transpose != NULL) { @@ -1148,6 +1180,7 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength }); } }); + Kokkos::fence(); } } else @@ -1181,6 +1214,7 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength }); } }); + Kokkos::fence(); } // We've done another top level loop @@ -1205,6 +1239,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength Kokkos::fence(); } + Kokkos::fence(); + } while (counter_undecided != 0); @@ -1216,6 +1252,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength if (destroy_spst) PetscCallVoid(MatDestroy(&mat_local_spst)); if (destroy_nonlocal_transpose) PetscCallVoid(MatDestroy(&mat_nonlocal_transpose)); + Kokkos::fence(); + // ~~~~~~~~~ // Now assign our final cf markers // ~~~~~~~~~ @@ -1299,6 +1337,8 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons auto exec = PetscGetKokkosExecutionSpace(); + Kokkos::fence(); + // If you want to generate the randoms on the device //Kokkos::Random_XorShift64_Pool<> random_pool(/*seed=*/12345); // Copy the input measure from host to device @@ -1307,6 +1347,8 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons size_t bytes = measure_local_h.extent(0) * sizeof(PetscReal); PetscCallVoid(PetscLogCpuToGpu(bytes)); + Kokkos::fence(); + // Compute the measure Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, local_rows), KOKKOS_LAMBDA(PetscInt i) { From 762990312e26310b4e3d5d51a4730b69f1bb0550 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Fri, 10 Apr 2026 18:10:25 +0100 Subject: [PATCH 58/60] Disable prints --- src/DDC_Modulek.kokkos.cxx | 2 +- src/Device_Datak.kokkos.cxx | 12 ++-- src/Gmres_Polyk.kokkos.cxx | 2 +- src/Grid_Transferk.kokkos.cxx | 6 +- src/MatDiagDomk.kokkos.cxx | 2 +- src/PETSc_Helperk.kokkos.cxx | 122 ++++++++++++++++----------------- src/PMISR_Modulek.kokkos.cxx | 6 +- src/SAI_Zk.kokkos.cxx | 2 +- src/VecISCopyLocalk.kokkos.cxx | 8 +-- 9 files changed, 81 insertions(+), 81 deletions(-) diff --git a/src/DDC_Modulek.kokkos.cxx b/src/DDC_Modulek.kokkos.cxx index 2a49c027..2ebc7f3b 100644 --- a/src/DDC_Modulek.kokkos.cxx +++ b/src/DDC_Modulek.kokkos.cxx @@ -15,7 +15,7 @@ // You have to explicitly call copy_cf_markers_d2h(cf_markers_local) to do this PETSC_INTERN void ddc_kokkos(Mat *input_mat, const PetscReal fraction_swap, const PetscReal max_dd_ratio, const PetscReal max_dd_ratio_achieved, Mat *aff, PetscReal *random_numbers) { - PflareKokkosTrace _trace("ddc_kokkos"); + //PflareKokkosTrace _trace("ddc_kokkos"); // Can't use the global directly within the parallel // regions on the device intKokkosView cf_markers_d = cf_markers_local_d; diff --git a/src/Device_Datak.kokkos.cxx b/src/Device_Datak.kokkos.cxx index 811117b6..986a4846 100644 --- a/src/Device_Datak.kokkos.cxx +++ b/src/Device_Datak.kokkos.cxx @@ -15,7 +15,7 @@ PetscScalarKokkosView diag_dom_ratio_local_d; // Copy the global cf_markers_local_d back to the host PETSC_INTERN void copy_cf_markers_d2h(int *cf_markers_local) { - PflareKokkosTrace _trace("copy_cf_markers_d2h"); + //PflareKokkosTrace _trace("copy_cf_markers_d2h"); // Host wrapper for cf_markers_local intKokkosViewHost cf_markers_local_h(cf_markers_local, cf_markers_local_d.extent(0)); @@ -37,7 +37,7 @@ PETSC_INTERN void copy_cf_markers_d2h(int *cf_markers_local) // Copy the global diag_dom_ratio_local_d back to the host PETSC_INTERN void copy_diag_dom_ratio_d2h(PetscReal *diag_dom_ratio_local) { - PflareKokkosTrace _trace("copy_diag_dom_ratio_d2h"); + //PflareKokkosTrace _trace("copy_diag_dom_ratio_d2h"); // Host wrapper for diag_dom_ratio_local PetscScalarKokkosViewHost diag_dom_ratio_h(diag_dom_ratio_local, diag_dom_ratio_local_d.extent(0)); @@ -59,7 +59,7 @@ PETSC_INTERN void copy_diag_dom_ratio_d2h(PetscReal *diag_dom_ratio_local) // Delete the global cf_markers_local_d PETSC_INTERN void delete_device_cf_markers() { - PflareKokkosTrace _trace("delete_device_cf_markers"); + //PflareKokkosTrace _trace("delete_device_cf_markers"); // Delete the device view - this assigns an empty view // and hence the old view has its ref counter decremented cf_markers_local_d = intKokkosView(); @@ -72,7 +72,7 @@ PETSC_INTERN void delete_device_cf_markers() // Delete the global diag_dom_ratio_local_d PETSC_INTERN void delete_device_diag_dom_ratio() { - PflareKokkosTrace _trace("delete_device_diag_dom_ratio"); + //PflareKokkosTrace _trace("delete_device_diag_dom_ratio"); // Delete the device view - this assigns an empty view // and hence the old view has its ref counter decremented diag_dom_ratio_local_d = PetscScalarKokkosView(); @@ -85,7 +85,7 @@ PETSC_INTERN void delete_device_diag_dom_ratio() // Creates the device local indices for F or C points based on the global cf_markers_local_d PETSC_INTERN void create_cf_is_device_kokkos(Mat *input_mat, const int match_cf, PetscIntKokkosView &is_local_d) { - PflareKokkosTrace _trace("create_cf_is_device_kokkos"); + //PflareKokkosTrace _trace("create_cf_is_device_kokkos"); PetscInt local_rows, local_cols; PetscCallVoid(MatGetLocalSize(*input_mat, &local_rows, &local_cols)); auto exec = PetscGetKokkosExecutionSpace(); @@ -146,7 +146,7 @@ PETSC_INTERN void create_cf_is_device_kokkos(Mat *input_mat, const int match_cf, // Creates the host IS is_fine and is_coarse based on the global cf_markers_local_d PETSC_INTERN void create_cf_is_kokkos(Mat *input_mat, IS *is_fine, IS *is_coarse) { - PflareKokkosTrace _trace("create_cf_is_kokkos"); + //PflareKokkosTrace _trace("create_cf_is_kokkos"); PetscIntKokkosView is_fine_local_d, is_coarse_local_d; MPI_Comm MPI_COMM_MATRIX; PetscCallVoid(PetscObjectGetComm((PetscObject)*input_mat, &MPI_COMM_MATRIX)); diff --git a/src/Gmres_Polyk.kokkos.cxx b/src/Gmres_Polyk.kokkos.cxx index ef133a48..c51facac 100644 --- a/src/Gmres_Polyk.kokkos.cxx +++ b/src/Gmres_Polyk.kokkos.cxx @@ -8,7 +8,7 @@ PETSC_INTERN void mat_mult_powers_share_sparsity_kokkos(Mat *input_mat, const int poly_order, const int poly_sparsity_order, PetscReal *coefficients, \ const int reuse_int_reuse_mat, Mat *reuse_mat, const int reuse_int_cmat, Mat *output_mat) { - PflareKokkosTrace _trace("mat_mult_powers_share_sparsity_kokkos"); + //PflareKokkosTrace _trace("mat_mult_powers_share_sparsity_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows, local_cols; PetscInt global_row_start_temp, global_row_end_plus_one_temp; diff --git a/src/Grid_Transferk.kokkos.cxx b/src/Grid_Transferk.kokkos.cxx index 529e6fd8..7746df2e 100644 --- a/src/Grid_Transferk.kokkos.cxx +++ b/src/Grid_Transferk.kokkos.cxx @@ -7,7 +7,7 @@ // Generate one point classical prolongator but with kokkos - keeping everything on the device PETSC_INTERN void generate_one_point_with_one_entry_from_sparse_kokkos(Mat *input_mat, Mat *output_mat) { - PflareKokkosTrace _trace("generate_one_point_with_one_entry_from_sparse_kokkos"); + //PflareKokkosTrace _trace("generate_one_point_with_one_entry_from_sparse_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows, local_cols, global_rows, global_cols; PetscInt global_row_start, global_row_end_plus_one; @@ -309,7 +309,7 @@ PETSC_INTERN void generate_one_point_with_one_entry_from_sparse_kokkos(Mat *inpu PETSC_INTERN void compute_P_from_W_kokkos(Mat *input_mat, PetscInt global_row_start, IS *is_fine, \ IS *is_coarse, int identity_int, int reuse_int, Mat *output_mat) { - PflareKokkosTrace _trace("compute_P_from_W_kokkos"); + //PflareKokkosTrace _trace("compute_P_from_W_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt global_row_start_W, global_row_end_plus_one_W; PetscInt global_col_start_W, global_col_end_plus_one_W; @@ -737,7 +737,7 @@ PETSC_INTERN void compute_R_from_Z_kokkos(Mat *input_mat, PetscInt global_row_st IS *is_coarse, IS *orig_fine_col_indices, int identity_int, int reuse_int, int reuse_indices_int, \ Mat *output_mat) { - PflareKokkosTrace _trace("compute_R_from_Z_kokkos"); + //PflareKokkosTrace _trace("compute_R_from_Z_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt global_row_start_Z, global_row_end_plus_one_Z; PetscInt global_col_start_Z, global_col_end_plus_one_Z; diff --git a/src/MatDiagDomk.kokkos.cxx b/src/MatDiagDomk.kokkos.cxx index ed33bcc9..7eaebcbb 100644 --- a/src/MatDiagDomk.kokkos.cxx +++ b/src/MatDiagDomk.kokkos.cxx @@ -14,7 +14,7 @@ // This code is very similar to MatCreateSubMatrix_kokkos PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio_achieved, PetscInt *local_rows_aff) { - PflareKokkosTrace _trace("MatDiagDomRatio_kokkos"); + //PflareKokkosTrace _trace("MatDiagDomRatio_kokkos"); PetscInt local_rows, local_cols; Kokkos::fence(); diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index a101e312..c62a2df1 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -11,7 +11,7 @@ static PetscErrorCode check_exact_petscint_to_scalar_encoding(PetscInt max_encoded_value, MPI_Comm comm) { PetscFunctionBegin; - PflareKokkosTrace _trace("check_exact_petscint_to_scalar_encoding"); + //PflareKokkosTrace _trace("check_exact_petscint_to_scalar_encoding"); if (max_encoded_value <= 0) PetscFunctionReturn(PETSC_SUCCESS); const int digits = std::numeric_limits::digits; @@ -29,7 +29,7 @@ static PetscErrorCode check_exact_petscint_to_scalar_encoding(PetscInt max_encod // Sync the kokkos parts of the matrix to make sure they're up to date PETSC_INTERN void mat_sync(Mat *X) { - PflareKokkosTrace _trace("mat_sync"); + //PflareKokkosTrace _trace("mat_sync"); MatType mat_type; PetscCallVoid(MatGetType(*X, &mat_type)); // Are we in parallel? @@ -70,7 +70,7 @@ PETSC_INTERN void mat_sync(Mat *X) // Fences internally. static void remap_j_to_local_device(PetscIntKokkosView j_d, PetscIntKokkosView garray_d, PetscInt col_ao_output) { - PflareKokkosTrace _trace("remap_j_to_local_device"); + //PflareKokkosTrace _trace("remap_j_to_local_device"); auto exec = PetscGetKokkosExecutionSpace(); if (j_d.extent(0) == 0) return; @@ -87,7 +87,7 @@ static void remap_j_to_local_device(PetscIntKokkosView j_d, PetscIntKokkosView g // garray_d (out) is a device view of the sorted unique global column indices (size col_ao_output). static void rewrite_j_global_to_local_device(PetscInt colmap_max_size, PetscInt &col_ao_output, PetscIntKokkosView j_nonlocal_d, PetscIntKokkosView &garray_d) { - PflareKokkosTrace _trace("rewrite_j_global_to_local_device"); + //PflareKokkosTrace _trace("rewrite_j_global_to_local_device"); auto exec = PetscGetKokkosExecutionSpace(); // Need to preallocate to the max size @@ -126,7 +126,7 @@ static void rewrite_j_global_to_local_device(PetscInt colmap_max_size, PetscInt // Generate the colmap and rewrite input global j indices to local given the calculated colmap PETSC_INTERN void rewrite_j_global_to_local(PetscInt colmap_max_size, PetscInt &col_ao_output, PetscIntKokkosView j_nonlocal_d, PetscInt **garray_host) { - PflareKokkosTrace _trace("rewrite_j_global_to_local"); + //PflareKokkosTrace _trace("rewrite_j_global_to_local"); auto exec = PetscGetKokkosExecutionSpace(); PetscIntKokkosView garray_d; @@ -154,7 +154,7 @@ PETSC_INTERN void remove_small_from_sparse_kokkos(Mat *input_mat, const PetscRea const int relative_max_row_tolerance_int, const int lump_int, \ const int allow_drop_diagonal_int, const int allow_diag_strength_int) { - PflareKokkosTrace _trace("remove_small_from_sparse_kokkos"); + //PflareKokkosTrace _trace("remove_small_from_sparse_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows, local_cols, global_rows, global_cols; PetscInt global_row_start_temp, global_row_end_plus_one_temp; @@ -883,7 +883,7 @@ PETSC_INTERN void remove_small_from_sparse_kokkos(Mat *input_mat, const PetscRea // Drop according to a existing sparsity in output_mat but with kokkos - keeping everything on the device PETSC_INTERN void remove_from_sparse_match_kokkos(Mat *input_mat, Mat *output_mat, const int lump_int, const int alpha_int, const PetscReal alpha) { - PflareKokkosTrace _trace("remove_from_sparse_match_kokkos"); + //PflareKokkosTrace _trace("remove_from_sparse_match_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows, local_cols, global_rows, global_cols; PetscInt global_row_start_temp, global_row_end_plus_one_temp; @@ -1231,7 +1231,7 @@ PETSC_INTERN void remove_from_sparse_match_kokkos(Mat *input_mat, Mat *output_ma // Set all the values of the matrix to val PETSC_INTERN void MatSetAllValues_kokkos(Mat *input_mat, PetscReal val) { - PflareKokkosTrace _trace("MatSetAllValues_kokkos"); + //PflareKokkosTrace _trace("MatSetAllValues_kokkos"); MatType mat_type; PetscCallVoid(MatGetType(*input_mat, &mat_type)); @@ -1306,7 +1306,7 @@ PETSC_INTERN void MatSetAllValues_kokkos(Mat *input_mat, PetscReal val) // Duplicate and copy a matrix ensuring it always has a diagonal but with kokkos - keeping everything on the device PETSC_INTERN void mat_duplicate_copy_plus_diag_kokkos(Mat *input_mat, const int reuse_int, Mat *output_mat) { - PflareKokkosTrace _trace("mat_duplicate_copy_plus_diag_kokkos"); + //PflareKokkosTrace _trace("mat_duplicate_copy_plus_diag_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt global_row_start_temp, global_row_end_plus_one_temp; PetscInt global_col_start_temp, global_col_end_plus_one_temp; @@ -1718,7 +1718,7 @@ PETSC_INTERN void mat_duplicate_copy_plus_diag_kokkos(Mat *input_mat, const int // Does a MatAXPY for a MPIAIJ Kokkos matrix - the petsc version currently uses the host making it very slow PETSC_INTERN void MatAXPY_kokkos(Mat *Y, PetscScalar alpha, Mat *X) { - PflareKokkosTrace _trace("MatAXPY_kokkos"); + //PflareKokkosTrace _trace("MatAXPY_kokkos"); Mat mat_local_y = NULL, mat_nonlocal_y = NULL; Mat mat_local_x = NULL, mat_nonlocal_x = NULL; @@ -1963,7 +1963,7 @@ PETSC_INTERN void MatAXPY_kokkos(Mat *Y, PetscScalar alpha, Mat *X) // is_col must be sorted PETSC_INTERN void MatCreateSubMatrix_Seq_kokkos(Mat *input_mat, PetscIntKokkosView &is_row_d_d, PetscIntKokkosView &is_col_d_d, const int reuse_int, Mat *output_mat) { - PflareKokkosTrace _trace("MatCreateSubMatrix_Seq_kokkos"); + //PflareKokkosTrace _trace("MatCreateSubMatrix_Seq_kokkos"); PetscInt local_rows, local_cols; PetscInt nnzs_match_local; auto exec = PetscGetKokkosExecutionSpace(); @@ -2426,7 +2426,7 @@ PETSC_INTERN void MatCreateSubMatrix_Seq_kokkos(Mat *input_mat, PetscIntKokkosVi PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosView is_row_d_d, PetscInt global_rows_row, \ PetscIntKokkosView is_col_d_d, PetscInt global_cols_col, const int reuse_int, Mat *output_mat, IS *rows_rows, IS *cols_cols) { -// PflareKokkosTrace _trace("MatCreateSubMatrix_kokkos_view"); +// //PflareKokkosTrace _trace("MatCreateSubMatrix_kokkos_view"); // PetscInt local_rows, local_cols; // PetscInt global_rows, global_cols; // PetscInt global_row_start, global_row_end_plus_one; @@ -2874,7 +2874,7 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos(Mat *input_mat, IS *is_row, IS *is_c const int reuse_int, Mat *output_mat, \ const int our_level, const int is_row_fine_int, const int is_col_fine_int) { - PflareKokkosTrace _trace("MatCreateSubMatrix_kokkos"); + //PflareKokkosTrace _trace("MatCreateSubMatrix_kokkos"); PetscInt global_row_start, global_row_end_plus_one; PetscInt global_col_start, global_col_end_plus_one; @@ -2891,54 +2891,54 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos(Mat *input_mat, IS *is_row, IS *is_c const int level_idx = our_level - 1; auto exec = PetscGetKokkosExecutionSpace(); - // If we want the input is_row and is_col to be used - if (our_level == -1) - { - // Get pointers to the indices on the host - const PetscInt *is_row_indices_ptr, *is_col_indices_ptr; - PetscCallVoid(ISGetIndices(*is_row, &is_row_indices_ptr)); - PetscCallVoid(ISGetIndices(*is_col, &is_col_indices_ptr)); - - PetscInt local_rows_row, local_cols_col; - PetscCallVoid(ISGetLocalSize(*is_row, &local_rows_row)); - PetscCallVoid(ISGetLocalSize(*is_col, &local_cols_col)); - - // Create a host view of the existing indices - auto is_row_view_h = PetscIntConstKokkosViewHost(is_row_indices_ptr, local_rows_row); - is_row_d_d = PetscIntKokkosView("is_row_d_d", local_rows_row); - auto is_col_view_h = PetscIntConstKokkosViewHost(is_col_indices_ptr, local_cols_col); - is_col_d_d = PetscIntKokkosView("is_col_d_d", local_cols_col); - // Copy indices to the device - Kokkos::deep_copy(exec, is_row_d_d, is_row_view_h); - Kokkos::deep_copy(exec, is_col_d_d, is_col_view_h); - // The source pointers come from ISGetIndices; ensure async copies complete - // before restoring those host buffers. - Kokkos::fence(); - // Log copy with petsc - size_t bytes = is_row_view_h.extent(0) * sizeof(PetscInt); - PetscCallVoid(PetscLogCpuToGpu(bytes)); - bytes = is_col_view_h.extent(0) * sizeof(PetscInt); - PetscCallVoid(PetscLogCpuToGpu(bytes)); - - PetscCallVoid(ISRestoreIndices(*is_row, &is_row_indices_ptr)); - PetscCallVoid(ISRestoreIndices(*is_col, &is_col_indices_ptr)); - - // ~~~~~~~~~~~~ - // Rewrite to local indices - // ~~~~~~~~~~~~ - Kokkos::parallel_for( - Kokkos::RangePolicy<>(exec, 0, is_row_d_d.extent(0)), KOKKOS_LAMBDA(PetscInt i) { - - is_row_d_d(i) -= global_row_start; // Make local - }); - - Kokkos::parallel_for( - Kokkos::RangePolicy<>(exec, 0, is_col_d_d.extent(0)), KOKKOS_LAMBDA(PetscInt i) { - - is_col_d_d(i) -= global_col_start; // Make local - }); - Kokkos::fence(); - } + // // If we want the input is_row and is_col to be used + // if (our_level == -1) + // { + // // Get pointers to the indices on the host + // const PetscInt *is_row_indices_ptr, *is_col_indices_ptr; + // PetscCallVoid(ISGetIndices(*is_row, &is_row_indices_ptr)); + // PetscCallVoid(ISGetIndices(*is_col, &is_col_indices_ptr)); + + // PetscInt local_rows_row, local_cols_col; + // PetscCallVoid(ISGetLocalSize(*is_row, &local_rows_row)); + // PetscCallVoid(ISGetLocalSize(*is_col, &local_cols_col)); + + // // Create a host view of the existing indices + // auto is_row_view_h = PetscIntConstKokkosViewHost(is_row_indices_ptr, local_rows_row); + // is_row_d_d = PetscIntKokkosView("is_row_d_d", local_rows_row); + // auto is_col_view_h = PetscIntConstKokkosViewHost(is_col_indices_ptr, local_cols_col); + // is_col_d_d = PetscIntKokkosView("is_col_d_d", local_cols_col); + // // Copy indices to the device + // Kokkos::deep_copy(exec, is_row_d_d, is_row_view_h); + // Kokkos::deep_copy(exec, is_col_d_d, is_col_view_h); + // // The source pointers come from ISGetIndices; ensure async copies complete + // // before restoring those host buffers. + // Kokkos::fence(); + // // Log copy with petsc + // size_t bytes = is_row_view_h.extent(0) * sizeof(PetscInt); + // PetscCallVoid(PetscLogCpuToGpu(bytes)); + // bytes = is_col_view_h.extent(0) * sizeof(PetscInt); + // PetscCallVoid(PetscLogCpuToGpu(bytes)); + + // PetscCallVoid(ISRestoreIndices(*is_row, &is_row_indices_ptr)); + // PetscCallVoid(ISRestoreIndices(*is_col, &is_col_indices_ptr)); + + // // ~~~~~~~~~~~~ + // // Rewrite to local indices + // // ~~~~~~~~~~~~ + // Kokkos::parallel_for( + // Kokkos::RangePolicy<>(exec, 0, is_row_d_d.extent(0)), KOKKOS_LAMBDA(PetscInt i) { + + // is_row_d_d(i) -= global_row_start; // Make local + // }); + + // Kokkos::parallel_for( + // Kokkos::RangePolicy<>(exec, 0, is_col_d_d.extent(0)), KOKKOS_LAMBDA(PetscInt i) { + + // is_col_d_d(i) -= global_col_start; // Make local + // }); + // Kokkos::fence(); + // } // // Instead if we tell the routine that the is_row and is_col are fine/coarse local indices // // that already are on the device // else diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx index 9757b4a5..a9b23b0d 100644 --- a/src/PMISR_Modulek.kokkos.cxx +++ b/src/PMISR_Modulek.kokkos.cxx @@ -15,7 +15,7 @@ // This mirrors the CPU version pmisr_existing_measure_cf_markers in PMISR_Module.F90 PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, const int max_luby_steps, const int pmis_int, PetscScalarKokkosView &measure_local_d, intKokkosView &cf_markers_d, const int zero_measure_c_point_int) { - PflareKokkosTrace _trace("pmisr_existing_measure_cf_markers_kokkos"); + //PflareKokkosTrace _trace("pmisr_existing_measure_cf_markers_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows, local_cols, global_rows, global_cols; @@ -529,7 +529,7 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // See the full comments in the CPU version pmisr_existing_measure_implicit_transpose PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength_mat, const int max_luby_steps, const int pmis_int, PetscScalarKokkosView &measure_local_d, intKokkosView &cf_markers_d, const int zero_measure_c_point_int) { - PflareKokkosTrace _trace("pmisr_existing_measure_implicit_transpose_kokkos"); + //PflareKokkosTrace _trace("pmisr_existing_measure_implicit_transpose_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows, local_cols, global_rows, global_cols; @@ -1287,7 +1287,7 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength // You have to explicitly call copy_cf_markers_d2h(cf_markers_local) to do this PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, const int pmis_int, PetscReal *measure_local, const int zero_measure_c_point_int) { - PflareKokkosTrace _trace("pmisr_kokkos"); + //PflareKokkosTrace _trace("pmisr_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows, local_cols, global_rows, global_cols; diff --git a/src/SAI_Zk.kokkos.cxx b/src/SAI_Zk.kokkos.cxx index dc40190b..d9da1994 100644 --- a/src/SAI_Zk.kokkos.cxx +++ b/src/SAI_Zk.kokkos.cxx @@ -13,7 +13,7 @@ PETSC_INTERN void calculate_and_build_sai_z_kokkos(Mat *A_ff, Mat *A_cf, Mat *sparsity_mat_cf, const int reuse_int_reuse_mat, Mat *reuse_mat, Mat *z_mat) { - PflareKokkosTrace _trace("calculate_and_build_sai_z_kokkos"); + //PflareKokkosTrace _trace("calculate_and_build_sai_z_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows_cf, local_cols_cf; PetscInt local_rows_ff, local_cols_ff; diff --git a/src/VecISCopyLocalk.kokkos.cxx b/src/VecISCopyLocalk.kokkos.cxx index 31652ec7..32bfa24a 100644 --- a/src/VecISCopyLocalk.kokkos.cxx +++ b/src/VecISCopyLocalk.kokkos.cxx @@ -13,7 +13,7 @@ int max_levels = -1; // Destroys the data PETSC_INTERN void destroy_VecISCopyLocal_kokkos() { - PflareKokkosTrace _trace("destroy_VecISCopyLocal_kokkos"); + //PflareKokkosTrace _trace("destroy_VecISCopyLocal_kokkos"); if (IS_fine_views_local) { // Will automatically call the destructor on each element delete[] IS_fine_views_local; @@ -32,7 +32,7 @@ PETSC_INTERN void destroy_VecISCopyLocal_kokkos() // Creates the data we need to do the equivalent of veciscopy on local data in kokkos PETSC_INTERN void create_VecISCopyLocal_kokkos(int max_levels_input) { - PflareKokkosTrace _trace("create_VecISCopyLocal_kokkos"); + //PflareKokkosTrace _trace("create_VecISCopyLocal_kokkos"); // If not built if (!IS_fine_views_local) { @@ -67,7 +67,7 @@ PETSC_INTERN void create_VecISCopyLocal_kokkos(int max_levels_input) // Copy the input IS's to the device for our_level PETSC_INTERN void set_VecISCopyLocal_kokkos_our_level(int our_level, PetscInt global_row_start, IS *index_fine, IS *index_coarse) { - PflareKokkosTrace _trace("set_VecISCopyLocal_kokkos_our_level"); + //PflareKokkosTrace _trace("set_VecISCopyLocal_kokkos_our_level"); auto exec = PetscGetKokkosExecutionSpace(); const int level_idx = our_level - 1; @@ -133,7 +133,7 @@ PETSC_INTERN void set_VecISCopyLocal_kokkos_our_level(int our_level, PetscInt gl // Do the equivalent of veciscopy on local data using the IS data on the device PETSC_INTERN void VecISCopyLocal_kokkos(int our_level, int fine_int, Vec *vfull, int mode_int, Vec *vreduced) { - PflareKokkosTrace _trace("VecISCopyLocal_kokkos"); + //PflareKokkosTrace _trace("VecISCopyLocal_kokkos"); Kokkos::fence(); const int level_idx = our_level - 1; From 367c46553208e4c85e34b1d2ec69c5195d8ed854 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Fri, 10 Apr 2026 18:39:58 +0100 Subject: [PATCH 59/60] Disable more printing --- include/kokkos_helper.hpp | 15 ++++++++------- src/AIR_MG_Setup.F90 | 12 ++++++------ src/AIR_Operators_Setup.F90 | 4 ++-- src/DDC_Modulek.kokkos.cxx | 4 ++-- src/Device_Datak.kokkos.cxx | 2 +- src/Gmres_Poly.F90 | 8 ++++---- src/PMISR_Modulek.kokkos.cxx | 2 +- 7 files changed, 24 insertions(+), 23 deletions(-) diff --git a/include/kokkos_helper.hpp b/include/kokkos_helper.hpp index 9c173b40..b8e6a3e9 100644 --- a/include/kokkos_helper.hpp +++ b/include/kokkos_helper.hpp @@ -189,13 +189,14 @@ inline void check_cf_markers_all_marked_kokkos( rank, (int)bad_count, (int)local_rows); fflush(stderr); MPI_Abort(MPI_COMM_MATRIX, 1); - } else { - fprintf(stderr, - "[PFLARE kokkos rank=%d] check_cf_markers_all_marked_kokkos: " - "all %d local points marked F or C OK\n", - rank, (int)local_rows); - fflush(stderr); - } + } + // else { + // fprintf(stderr, + // "[PFLARE kokkos rank=%d] check_cf_markers_all_marked_kokkos: " + // "all %d local points marked F or C OK\n", + // rank, (int)local_rows); + // fflush(stderr); + // } } // Check that is_fine_local_d and is_coarse_local_d together cover every local diff --git a/src/AIR_MG_Setup.F90 b/src/AIR_MG_Setup.F90 index 2c12f1c1..ccf705d0 100644 --- a/src/AIR_MG_Setup.F90 +++ b/src/AIR_MG_Setup.F90 @@ -157,7 +157,7 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) our_level .ge. air_data%options%auto_truncate_start_level .AND. & air_data%options%auto_truncate_start_level /= -1) then - print *, "starting auto truncate check on level ", our_level + !print *, "starting auto truncate check on level ", our_level call timer_start(TIMER_ID_AIR_TRUNCATE) @@ -170,7 +170,7 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) proc_stride, & air_data%inv_coarsest_poly_data) - print *, "starting approx inverse ", our_level + !print *, "starting approx inverse ", our_level ! Start the approximate inverse we'll use on this level call start_approximate_inverse(air_data%coarse_matrix(our_level), & @@ -193,7 +193,7 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) call VecDuplicate(rand_vec, sol_vec, ierr) call VecDuplicate(rand_vec, temp_vec, ierr) - print *, "starting finish approx inverse ", our_level + !print *, "starting finish approx inverse ", our_level ! Finish our approximate inverse call finish_approximate_inverse(air_data%coarse_matrix(our_level), & @@ -215,7 +215,7 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) air_data%inv_coarsest_poly_data%inverse_type == PFLAREINV_NEWTON_NO_EXTRA) .AND. & air_data%options%coarsest_matrix_free_polys) then - print *, "starting matvecs residual ", our_level + !print *, "starting matvecs residual ", our_level if (air_data%options%coarsest_diag_scale_polys) then call petsc_matvec_right_scale_poly_newton_residual_mf(air_data%inv_A_ff(our_level), rand_vec, temp_vec) @@ -233,7 +233,7 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) call VecAXPY(temp_vec, -1d0, rand_vec, ierr) end if - print *, "computing norms ", our_level + !print *, "computing norms ", our_level ! Get the achieved norm call VecNorm(temp_vec, NORM_2, achieved_rel_tol, ierr) @@ -263,7 +263,7 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) call timer_finish(TIMER_ID_AIR_TRUNCATE) end if - print *, "starting cf splitting ", our_level + !print *, "starting cf splitting ", our_level ! ~~~~~~~~~~~~ ! Compute the coarsening diff --git a/src/AIR_Operators_Setup.F90 b/src/AIR_Operators_Setup.F90 index 71c552b4..e8ec2f2c 100644 --- a/src/AIR_Operators_Setup.F90 +++ b/src/AIR_Operators_Setup.F90 @@ -193,7 +193,7 @@ subroutine get_submatrices_start_poly_coeff_comms(input_mat, our_level, air_data ! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ call timer_start(TIMER_ID_AIR_EXTRACT) - print *, "extract afc acf start" + !print *, "extract afc acf start" ! Only reuse when coarse matrix structure is stable (amount>=2 stores MAT_RAP_DROP) if (air_data%allocated_matrices_A_ff(our_level) .AND. & @@ -232,7 +232,7 @@ subroutine get_submatrices_start_poly_coeff_comms(input_mat, our_level, air_data ! air_data%A_cf(our_level), ierr) ! end if - print *, "extract afc acf done" + !print *, "extract afc acf done" call timer_finish(TIMER_ID_AIR_EXTRACT) diff --git a/src/DDC_Modulek.kokkos.cxx b/src/DDC_Modulek.kokkos.cxx index 2ebc7f3b..f6f69464 100644 --- a/src/DDC_Modulek.kokkos.cxx +++ b/src/DDC_Modulek.kokkos.cxx @@ -114,7 +114,7 @@ PETSC_INTERN void ddc_kokkos(Mat *input_mat, const PetscReal fraction_swap, cons // pmis_int=0 means PMISR, zero_measure_c_point_int=0 pmisr_existing_measure_implicit_transpose_kokkos(aff, -1, 0, measure_d, cf_markers_aff_d, 0); - check_cf_markers_all_marked_kokkos(cf_markers_aff_d, cf_markers_aff_d.extent(0), MPI_COMM_MATRIX); + //check_cf_markers_all_marked_kokkos(cf_markers_aff_d, cf_markers_aff_d.extent(0), MPI_COMM_MATRIX); Kokkos::fence(); @@ -128,7 +128,7 @@ PETSC_INTERN void ddc_kokkos(Mat *input_mat, const PetscReal fraction_swap, cons }); Kokkos::fence(); - check_cf_markers_all_marked_kokkos(cf_markers_d, cf_markers_d.extent(0), MPI_COMM_MATRIX); + //check_cf_markers_all_marked_kokkos(cf_markers_d, cf_markers_d.extent(0), MPI_COMM_MATRIX); } return; } diff --git a/src/Device_Datak.kokkos.cxx b/src/Device_Datak.kokkos.cxx index 986a4846..39eb51e0 100644 --- a/src/Device_Datak.kokkos.cxx +++ b/src/Device_Datak.kokkos.cxx @@ -164,7 +164,7 @@ PETSC_INTERN void create_cf_is_kokkos(Mat *input_mat, IS *is_fine, IS *is_coarse // Sanity check: fine + coarse must cover every local point exactly once // (check before global-index conversion while entries are still [0, local_rows-1]) - check_cf_is_all_local_kokkos(is_fine_local_d, is_coarse_local_d, local_rows_check, MPI_COMM_MATRIX); + //check_cf_is_all_local_kokkos(is_fine_local_d, is_coarse_local_d, local_rows_check, MPI_COMM_MATRIX); // Now convert them back to global indices PetscInt global_row_start, global_row_end_plus_one; diff --git a/src/Gmres_Poly.F90 b/src/Gmres_Poly.F90 index 011a279d..b6d26eef 100644 --- a/src/Gmres_Poly.F90 +++ b/src/Gmres_Poly.F90 @@ -477,7 +477,7 @@ subroutine calculate_gmres_polynomial_coefficients_arnoldi(matrix, poly_order, c call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) end if - print *, "about to muller" + !print *, "about to muller" ! ~~~~~~~~~~ ! Allocate space and create random numbers @@ -485,12 +485,12 @@ subroutine calculate_gmres_polynomial_coefficients_arnoldi(matrix, poly_order, c ! ~~~~~~~~~~ call create_temp_space_box_muller(matrix, subspace_size, V_n) - print *, "done muller" + !print *, "done muller" ! Create an extra vector for storage call VecDuplicate(V_n(1), w_j, ierr) - print *, "about to arnoldi" + !print *, "about to arnoldi" ! Do the Arnoldi and compute H_n and C_n ! We only compute H_n until we hit a relative residual of 1e-14 against the random rhs @@ -500,7 +500,7 @@ subroutine calculate_gmres_polynomial_coefficients_arnoldi(matrix, poly_order, c call arnoldi(matrix, poly_order, 1d-30, V_n, w_j, beta, H_n, m, C_n, y, rel_tol) if (present(user_rel_tol)) user_rel_tol = rel_tol - print *, "done arnoldi" + !print *, "done arnoldi" ! ~~~~~~~~~~~~~ ! Compute the polynomial coefficients, this is C_n(1:m, 1:m) y diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx index a9b23b0d..861fb734 100644 --- a/src/PMISR_Modulek.kokkos.cxx +++ b/src/PMISR_Modulek.kokkos.cxx @@ -1375,7 +1375,7 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons pmisr_existing_measure_cf_markers_kokkos(strength_mat, max_luby_steps, pmis_int, measure_local_d, cf_markers_d, zero_measure_c_point_int); // Sanity check: every local point must be marked F(-1) or C(1) - check_cf_markers_all_marked_kokkos(cf_markers_d, local_rows, MPI_COMM_MATRIX); + //check_cf_markers_all_marked_kokkos(cf_markers_d, local_rows, MPI_COMM_MATRIX); // If PMIS then we swap the CF markers from PMISR if (pmis_int) { From e4607f5d1afc18ae4839155146fe4189d65b690c Mon Sep 17 00:00:00 2001 From: sdargavi Date: Sat, 11 Apr 2026 02:35:25 +0100 Subject: [PATCH 60/60] Reenable gpu kokkos createsubmatrix --- src/PETSc_Helperk.kokkos.cxx | 1014 ++++++++++++---------------------- 1 file changed, 362 insertions(+), 652 deletions(-) diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index c62a2df1..357719b4 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -11,7 +11,7 @@ static PetscErrorCode check_exact_petscint_to_scalar_encoding(PetscInt max_encoded_value, MPI_Comm comm) { PetscFunctionBegin; - //PflareKokkosTrace _trace("check_exact_petscint_to_scalar_encoding"); + PflareKokkosTrace _trace("check_exact_petscint_to_scalar_encoding"); if (max_encoded_value <= 0) PetscFunctionReturn(PETSC_SUCCESS); const int digits = std::numeric_limits::digits; @@ -1970,57 +1970,7 @@ PETSC_INTERN void MatCreateSubMatrix_Seq_kokkos(Mat *input_mat, PetscIntKokkosVi PetscCallVoid(MatGetLocalSize(*input_mat, &local_rows, &local_cols)); PetscInt local_rows_row = is_row_d_d.extent(0), local_cols_col = is_col_d_d.extent(0); - - // ~~~~~~~~~~~~ - // DIAGNOSTIC (Step 1 of plan): verify is_row_d_d / is_col_d_d are in-bounds. - // If a caller supplies out-of-range indices, smap_d / device_local_i accesses - // below would silently clobber adjacent device allocations. - // ~~~~~~~~~~~~ - { - PetscInt row_min = 0, row_max = -1, col_min = 0, col_max = -1; - if (local_rows_row > 0) { - Kokkos::parallel_reduce("PFLARE_DBG_is_row_minmax", - Kokkos::RangePolicy<>(exec, 0, local_rows_row), - KOKKOS_LAMBDA(const PetscInt i, PetscInt &lmin) { - const PetscInt v = is_row_d_d(i); - if (v < lmin) lmin = v; - }, Kokkos::Min(row_min)); - Kokkos::parallel_reduce("PFLARE_DBG_is_row_max", - Kokkos::RangePolicy<>(exec, 0, local_rows_row), - KOKKOS_LAMBDA(const PetscInt i, PetscInt &lmax) { - const PetscInt v = is_row_d_d(i); - if (v > lmax) lmax = v; - }, Kokkos::Max(row_max)); - } - if (local_cols_col > 0) { - Kokkos::parallel_reduce("PFLARE_DBG_is_col_minmax", - Kokkos::RangePolicy<>(exec, 0, local_cols_col), - KOKKOS_LAMBDA(const PetscInt i, PetscInt &lmin) { - const PetscInt v = is_col_d_d(i); - if (v < lmin) lmin = v; - }, Kokkos::Min(col_min)); - Kokkos::parallel_reduce("PFLARE_DBG_is_col_max", - Kokkos::RangePolicy<>(exec, 0, local_cols_col), - KOKKOS_LAMBDA(const PetscInt i, PetscInt &lmax) { - const PetscInt v = is_col_d_d(i); - if (v > lmax) lmax = v; - }, Kokkos::Max(col_max)); - } - Kokkos::fence(); - if (local_rows_row > 0) { - PetscCheckAbort(row_min >= 0 && row_max < local_rows, PETSC_COMM_SELF, - PETSC_ERR_ARG_OUTOFRANGE, - "MatCreateSubMatrix_Seq_kokkos: is_row out of range [0,%" PetscInt_FMT ") got [%" PetscInt_FMT ",%" PetscInt_FMT "]", - local_rows, row_min, row_max); - } - if (local_cols_col > 0) { - PetscCheckAbort(col_min >= 0 && col_max < local_cols, PETSC_COMM_SELF, - PETSC_ERR_ARG_OUTOFRANGE, - "MatCreateSubMatrix_Seq_kokkos: is_col out of range [0,%" PetscInt_FMT ") got [%" PetscInt_FMT ",%" PetscInt_FMT "]", - local_cols, col_min, col_max); - } - } - + // ~~~~~~~~~~~~ // Get pointers to the i,j,vals on the device // ~~~~~~~~~~~~ @@ -2152,87 +2102,30 @@ PETSC_INTERN void MatCreateSubMatrix_Seq_kokkos(Mat *input_mat, PetscIntKokkosVi // Create i indices // ~~~~~~~~~~~~~~~ Kokkos::parallel_for( - Kokkos::RangePolicy<>(exec, 0, local_rows_row), KOKKOS_LAMBDA(PetscInt i_idx_is_row) { + Kokkos::RangePolicy<>(exec, 0, local_rows_row), KOKKOS_LAMBDA(PetscInt i_idx_is_row) { // The start of our row index comes from the scan - i_local_d(i_idx_is_row + 1) = nnz_match_local_row_d(i_idx_is_row); - }); - - // ~~~~~~~~~~~~ - // DIAGNOSTIC (Step 1b of plan): verify i_local_d's final value equals - // nnzs_match_local, and that device_local_j entries for the rows we touch - // are all inside [0, local_cols). Either inconsistency would cause the - // team kernel below to write j_local_d / a_local_d outside their bounds. - // ~~~~~~~~~~~~ - if (local_rows_row > 0) { - PetscInt i_local_last_h = 0; - auto i_local_tail = Kokkos::subview(i_local_d, local_rows_row); - Kokkos::View i_local_tail_h("PFLARE_DBG_i_local_tail"); - Kokkos::deep_copy(exec, i_local_tail_h, i_local_tail); - Kokkos::fence(); - i_local_last_h = i_local_tail_h(); - PetscCheckAbort(i_local_last_h == nnzs_match_local, PETSC_COMM_SELF, - PETSC_ERR_PLIB, - "MatCreateSubMatrix_Seq_kokkos: i_local_d tail (%" PetscInt_FMT ") != nnzs_match_local (%" PetscInt_FMT "), local_rows_row=%" PetscInt_FMT, - i_local_last_h, nnzs_match_local, local_rows_row); - - PetscInt jmin = 0, jmax = -1; - Kokkos::parallel_reduce("PFLARE_DBG_dev_j_min", - Kokkos::RangePolicy<>(exec, 0, local_rows_row), - KOKKOS_LAMBDA(const PetscInt ir, PetscInt &lmin) { - const PetscInt i = is_row_d_d(ir); - const PetscInt s = device_local_i[i]; - const PetscInt e = device_local_i[i + 1]; - for (PetscInt k = s; k < e; ++k) { - const PetscInt v = device_local_j[k]; - if (v < lmin) lmin = v; - } - }, Kokkos::Min(jmin)); - Kokkos::parallel_reduce("PFLARE_DBG_dev_j_max", - Kokkos::RangePolicy<>(exec, 0, local_rows_row), - KOKKOS_LAMBDA(const PetscInt ir, PetscInt &lmax) { - const PetscInt i = is_row_d_d(ir); - const PetscInt s = device_local_i[i]; - const PetscInt e = device_local_i[i + 1]; - for (PetscInt k = s; k < e; ++k) { - const PetscInt v = device_local_j[k]; - if (v > lmax) lmax = v; - } - }, Kokkos::Max(jmax)); - Kokkos::fence(); - if (jmax >= 0) { - PetscCheckAbort(jmin >= 0 && jmax < local_cols, PETSC_COMM_SELF, - PETSC_ERR_PLIB, - "MatCreateSubMatrix_Seq_kokkos: device_local_j out of [0,%" PetscInt_FMT ") got [%" PetscInt_FMT ",%" PetscInt_FMT "]", - local_cols, jmin, jmax); - } - } + i_local_d(i_idx_is_row + 1) = nnz_match_local_row_d(i_idx_is_row); + }); // Execute with scratch memory Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { - + // i_idx_is_row is the row index into the output const PetscInt i_idx_is_row = t.league_rank(); // i is the row index into the input - const PetscInt i = is_row_d_d(i_idx_is_row); + const PetscInt i = is_row_d_d(i_idx_is_row); // number of columns PetscInt ncols_local; ncols_local = device_local_i[i + 1] - device_local_i[i]; ScratchIntView scratch_indices; - // DIAGNOSTIC: ncols_local must not exceed max_nnz_local. - // If it does the scratch allocation below overruns the per-team - // budget and silently corrupts adjacent device memory. - // Use Kokkos::abort (not KOKKOS_ASSERT) so this fires unconditionally - // regardless of NDEBUG / KOKKOS_ENABLE_DEBUG build flags. - if (ncols_local > max_nnz_local) Kokkos::abort("PFLARE: ncols_local > max_nnz_local in MatCreateSubMatrix_Seq_kokkos — scratch pool overflow"); - // Allocate views directly on scratch memory // Have to use views here given alignment issues // We have of size ncols+1 to account for the exclusive scan - scratch_indices = ScratchIntView(t.team_scratch(1), ncols_local+1); - + scratch_indices = ScratchIntView(t.team_scratch(1), ncols_local+1); + // Initialize scratch Kokkos::parallel_for(Kokkos::TeamVectorRange(t, ncols_local+1), [&](const PetscInt j) { scratch_indices(j) = 0; @@ -2274,37 +2167,10 @@ PETSC_INTERN void MatCreateSubMatrix_Seq_kokkos(Mat *input_mat, PetscIntKokkosVi { // Be careful to use the correct i_idx_is_row index into i_local_d here j_local_d(i_local_d(i_idx_is_row) + scratch_indices(j)) = smap_d(device_local_j[device_local_i[i] + j]) - 1; - a_local_d(i_local_d(i_idx_is_row) + scratch_indices(j)) = device_local_vals[device_local_i[i] + j]; + a_local_d(i_local_d(i_idx_is_row) + scratch_indices(j)) = device_local_vals[device_local_i[i] + j]; } }); - }); - - // ~~~~~~~~~~~~ - // DIAGNOSTIC (Step 1c of plan): post-team-kernel sanity check on the - // produced j_local_d. Every column index handed to PETSc must be in - // [0, local_cols_col); a value outside that range would either be a - // smap_d corruption or a per-row scan / write-offset bug. - // ~~~~~~~~~~~~ - if (nnzs_match_local > 0) { - PetscInt jout_min = 0, jout_max = -1; - Kokkos::parallel_reduce("PFLARE_DBG_jlocal_min", - Kokkos::RangePolicy<>(exec, 0, nnzs_match_local), - KOKKOS_LAMBDA(const PetscInt k, PetscInt &lmin) { - const PetscInt v = j_local_d(k); - if (v < lmin) lmin = v; - }, Kokkos::Min(jout_min)); - Kokkos::parallel_reduce("PFLARE_DBG_jlocal_max", - Kokkos::RangePolicy<>(exec, 0, nnzs_match_local), - KOKKOS_LAMBDA(const PetscInt k, PetscInt &lmax) { - const PetscInt v = j_local_d(k); - if (v > lmax) lmax = v; - }, Kokkos::Max(jout_max)); - Kokkos::fence(); - PetscCheckAbort(jout_min >= 0 && jout_max < local_cols_col, PETSC_COMM_SELF, - PETSC_ERR_PLIB, - "MatCreateSubMatrix_Seq_kokkos: j_local_d out of [0,%" PetscInt_FMT ") got [%" PetscInt_FMT ",%" PetscInt_FMT "], nnzs=%" PetscInt_FMT, - local_cols_col, jout_min, jout_max, nnzs_match_local); - } + }); } // If we're reusing, we can just write directly to the existing views else @@ -2325,25 +2191,22 @@ PETSC_INTERN void MatCreateSubMatrix_Seq_kokkos(Mat *input_mat, PetscIntKokkosVi // Execute with scratch memory Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { - + // i_idx_is_row is the row index into the output const PetscInt i_idx_is_row = t.league_rank(); // i is the row index into the input - const PetscInt i = is_row_d_d(i_idx_is_row); + const PetscInt i = is_row_d_d(i_idx_is_row); // number of columns PetscInt ncols_local; ncols_local = device_local_i[i + 1] - device_local_i[i]; ScratchIntView scratch_indices; - // DIAGNOSTIC: same scratch-overflow guard as in the non-reuse kernel above. - KOKKOS_ASSERT(ncols_local <= max_nnz_local); - // Allocate views directly on scratch memory // Have to use views here given alignment issues // We have of size ncols+1 to account for the exclusive scan - scratch_indices = ScratchIntView(t.team_scratch(1), ncols_local+1); - + scratch_indices = ScratchIntView(t.team_scratch(1), ncols_local+1); + // Initialize scratch Kokkos::parallel_for(Kokkos::TeamVectorRange(t, ncols_local+1), [&](const PetscInt j) { scratch_indices(j) = 0; @@ -2423,443 +2286,299 @@ PETSC_INTERN void MatCreateSubMatrix_Seq_kokkos(Mat *input_mat, PetscIntKokkosVi // as the matrices, ie equivalent to MatCreateSubMatrix_MPIAIJ_SameRowDist // is_col must be sorted // This one uses the views is_row_d_d and is_col_d_d directly, rewritten to be the local indices -PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosView is_row_d_d, PetscInt global_rows_row, \ - PetscIntKokkosView is_col_d_d, PetscInt global_cols_col, const int reuse_int, Mat *output_mat, IS *rows_rows, IS *cols_cols) +PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosView &is_row_d_d, PetscInt global_rows_row, \ + PetscIntKokkosView &is_col_d_d, PetscInt global_cols_col, const int reuse_int, Mat *output_mat) { -// //PflareKokkosTrace _trace("MatCreateSubMatrix_kokkos_view"); -// PetscInt local_rows, local_cols; -// PetscInt global_rows, global_cols; -// PetscInt global_row_start, global_row_end_plus_one; -// // PetscCallVoid(MatGetOwnershipRange(*input_mat, &global_row_start, &global_row_end_plus_one)); -// PetscInt local_cols_col = is_col_d_d.extent(0); -// auto exec = PetscGetKokkosExecutionSpace(); - -// // // Are we in parallel? -// // MatType mat_type; -// MPI_Comm MPI_COMM_MATRIX; -// // PetscCallVoid(MatGetType(*input_mat, &mat_type)); - -// // const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; -// const bool mpi = true; -// // PetscCallVoid(PetscObjectGetComm((PetscObject)*input_mat, &MPI_COMM_MATRIX)); -// // PetscCallVoid(MatGetSize(*input_mat, &global_rows, &global_cols)); -// // PetscCallVoid(MatGetLocalSize(*input_mat, &local_rows, &local_cols)); - -// Mat_MPIAIJ *mat_mpi = nullptr; -// Mat mat_local = NULL, mat_nonlocal = NULL; -// Mat output_mat_local, output_mat_nonlocal; + //PflareKokkosTrace _trace("MatCreateSubMatrix_kokkos_view"); + PetscInt local_rows, local_cols; + PetscInt global_rows, global_cols; + PetscInt global_row_start, global_row_end_plus_one; + PetscCallVoid(MatGetOwnershipRange(*input_mat, &global_row_start, &global_row_end_plus_one)); + PetscInt local_cols_col = is_col_d_d.extent(0); + auto exec = PetscGetKokkosExecutionSpace(); + + // Are we in parallel? + MatType mat_type; + MPI_Comm MPI_COMM_MATRIX; + PetscCallVoid(MatGetType(*input_mat, &mat_type)); + + const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; + PetscCallVoid(PetscObjectGetComm((PetscObject)*input_mat, &MPI_COMM_MATRIX)); + PetscCallVoid(MatGetSize(*input_mat, &global_rows, &global_cols)); + PetscCallVoid(MatGetLocalSize(*input_mat, &local_rows, &local_cols)); + + Mat_MPIAIJ *mat_mpi = nullptr; + Mat mat_local = NULL, mat_nonlocal = NULL; + Mat output_mat_local, output_mat_nonlocal; -// PetscInt rows_ao, cols_ao; -// // if (mpi) -// // { -// // mat_mpi = (Mat_MPIAIJ *)(*input_mat)->data; -// // PetscCallVoid(MatMPIAIJGetSeqAIJ(*input_mat, &mat_local, &mat_nonlocal, NULL)); -// // PetscCallVoid(MatGetSize(mat_nonlocal, &rows_ao, &cols_ao)); + PetscInt rows_ao, cols_ao; + if (mpi) + { + mat_mpi = (Mat_MPIAIJ *)(*input_mat)->data; + PetscCallVoid(MatMPIAIJGetSeqAIJ(*input_mat, &mat_local, &mat_nonlocal, NULL)); + PetscCallVoid(MatGetSize(mat_nonlocal, &rows_ao, &cols_ao)); -// // if (reuse_int) -// // { -// // PetscCallVoid(MatMPIAIJGetSeqAIJ(*output_mat, &output_mat_local, &output_mat_nonlocal, NULL)); -// // } -// // } -// // else -// // { -// // mat_local = *input_mat; -// // if (reuse_int) output_mat_local = *output_mat; -// // } -// size_t bytes = 0; - -// // Ablation toggle (Step 2 of plan): when defined non-zero, the diagonal -// // MatCreateSubMatrix_Seq_kokkos call is replaced by PETSc's host-side -// // MatCreateSubMatrix on mat_local plus a MatConvert back to MATSEQAIJKOKKOS. -// // Used to test whether the intermittent GPU crash originates inside the -// // diag Seq_kokkos kernel chain. Reuse path is unchanged (crashes are -// // first-call only). Toggle off (set to 0) to restore the original path. -// #ifndef PFLARE_ABLATE_DIAG_SUBMAT -// #define PFLARE_ABLATE_DIAG_SUBMAT 0 -// #endif - -// // // The diagonal component -// // #if PFLARE_ABLATE_DIAG_SUBMAT -// // if (!reuse_int) -// // { -// // // Pull the (already-local) is_row / is_col indices back to the host so -// // // PETSc's CPU MatCreateSubMatrix can consume them. mat_local is a -// // // SeqAIJKokkos but PETSc's MatCreateSubMatrix dispatches to the host -// // // SeqAIJ implementation, producing a SeqAIJ result that we then convert -// // // back to SeqAIJKokkos for the downstream MatCreateMPIAIJWithSeqAIJ. -// // const PetscInt n_row_h = is_row_d_d.extent(0); -// // const PetscInt n_col_h = is_col_d_d.extent(0); -// // PetscInt *is_row_host_arr = NULL, *is_col_host_arr = NULL; -// // PetscCallVoid(PetscMalloc1(n_row_h > 0 ? n_row_h : 1, &is_row_host_arr)); -// // PetscCallVoid(PetscMalloc1(n_col_h > 0 ? n_col_h : 1, &is_col_host_arr)); -// // PetscIntKokkosViewHost is_row_h_view(is_row_host_arr, n_row_h); -// // PetscIntKokkosViewHost is_col_h_view(is_col_host_arr, n_col_h); -// // Kokkos::deep_copy(exec, is_row_h_view, is_row_d_d); -// // Kokkos::deep_copy(exec, is_col_h_view, is_col_d_d); -// // Kokkos::fence(); - -// // IS is_row_temp = NULL, is_col_temp = NULL; -// // PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, n_row_h, is_row_host_arr, PETSC_COPY_VALUES, &is_row_temp)); -// // PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, n_col_h, is_col_host_arr, PETSC_COPY_VALUES, &is_col_temp)); - -// // Mat tmp_host_mat = NULL; -// // PetscCallVoid(MatCreateSubMatrix(mat_local, is_row_temp, is_col_temp, MAT_INITIAL_MATRIX, &output_mat_local)); -// // // Convert the SeqAIJ host result to SeqAIJKokkos so the downstream -// // // MatCreateMPIAIJWithSeqAIJ + reuse storage hand-off still get a Kokkos -// // // seq block (matches what MatCreateSubMatrix_Seq_kokkos would have -// // // produced). -// // //PetscCallVoid(MatConvert(tmp_host_mat, MATSEQAIJKOKKOS, MAT_INITIAL_MATRIX, &output_mat_local)); - -// // //PetscCallVoid(MatDestroy(&tmp_host_mat)); -// // PetscCallVoid(ISDestroy(&is_row_temp)); -// // PetscCallVoid(ISDestroy(&is_col_temp)); -// // PetscCallVoid(PetscFree(is_row_host_arr)); -// // PetscCallVoid(PetscFree(is_col_host_arr)); -// // } -// // else -// // { -// // MatCreateSubMatrix_Seq_kokkos(&mat_local, is_row_d_d, is_col_d_d, reuse_int, &output_mat_local); -// // } -// // #else -// // MatCreateSubMatrix_Seq_kokkos(&mat_local, is_row_d_d, is_col_d_d, reuse_int, &output_mat_local); -// // #endif - -// // The off-diagonal component requires some comms -// // Basically a copy of MatCreateSubMatrix_MPIAIJ_SameRowColDist - -// // Off-diagonal ablation toggle (step 2a of plan): when non-zero, the entire -// // off-diag VecScatter + Seq_kokkos-nonlocal + MatCreateMPIAIJWithSeqAIJ path -// // is replaced by PETSc's CPU MatCreateSubMatrix on the full MPIAIJ input, -// // converted back to MATMPIAIJKOKKOS. Combine with PFLARE_ABLATE_DIAG_SUBMAT=0 -// // so that only the off-diag section is ablated while diag uses our Kokkos kernel. -// // Only the first-call (non-reuse) path is ablated, matching the observed failure mode. -// #ifndef PFLARE_ABLATE_OFFDIAG_SUBMAT -// #define PFLARE_ABLATE_OFFDIAG_SUBMAT 1 -// #endif - -// if (mpi) -// { -// #if PFLARE_ABLATE_OFFDIAG_SUBMAT -// if (!reuse_int) -// { -// // // We need global IS indices (is_row/is_col on device are already LOCAL, -// // // i.e. row_global - global_row_start; add back the offset before calling -// // // PETSc's CPU MatCreateSubMatrix which expects global indices). -// // PetscInt global_row_start_abl = 0, global_row_end_abl = 0; -// // PetscInt global_col_start_abl = 0, global_col_end_abl = 0; -// // PetscCallVoid(MatGetOwnershipRange(*input_mat, &global_row_start_abl, &global_row_end_abl)); -// // PetscCallVoid(MatGetOwnershipRangeColumn(*input_mat, &global_col_start_abl, &global_col_end_abl)); - -// // const PetscInt n_row_abl = (PetscInt)is_row_d_d.extent(0); -// // const PetscInt n_col_abl = (PetscInt)is_col_d_d.extent(0); -// // PetscInt *is_row_g_arr = NULL, *is_col_g_arr = NULL; -// // PetscCallVoid(PetscMalloc1(n_row_abl > 0 ? n_row_abl : 1, &is_row_g_arr)); -// // PetscCallVoid(PetscMalloc1(n_col_abl > 0 ? n_col_abl : 1, &is_col_g_arr)); - -// // // Copy local device indices to host then shift back to global. -// // PetscIntKokkosViewHost is_row_g_h(is_row_g_arr, n_row_abl); -// // PetscIntKokkosViewHost is_col_g_h(is_col_g_arr, n_col_abl); -// // Kokkos::deep_copy(exec, is_row_g_h, is_row_d_d); -// // Kokkos::deep_copy(exec, is_col_g_h, is_col_d_d); -// // Kokkos::fence(); -// // for (PetscInt ii = 0; ii < n_row_abl; ii++) is_row_g_arr[ii] += global_row_start_abl; -// // for (PetscInt ii = 0; ii < n_col_abl; ii++) is_col_g_arr[ii] += global_col_start_abl; - -// // IS is_row_g_abl = NULL, is_col_g_abl = NULL; -// // PetscCallVoid(ISCreateGeneral(MPI_COMM_MATRIX, n_row_abl, is_row_g_arr, PETSC_OWN_POINTER, &is_row_g_abl)); -// // PetscCallVoid(ISCreateGeneral(MPI_COMM_MATRIX, n_col_abl, is_col_g_arr, PETSC_OWN_POINTER, &is_col_g_abl)); - -// // PetscBool equal_flag; -// // PetscCallVoid(ISEqualUnsorted(is_row_g_abl, *rows_rows, &equal_flag)); - -// // PetscCheckAbort(equal_flag, MPI_COMM_MATRIX, -// // PETSC_ERR_PLIB, -// // "rows not equal"); - -// // PetscCallVoid(ISEqualUnsorted(is_col_g_abl, *cols_cols, &equal_flag)); - -// // PetscCheckAbort(equal_flag, MPI_COMM_MATRIX, -// // PETSC_ERR_PLIB, -// // "cols not equal"); - -// // Mat tmp_abl = NULL; -// //PetscCallVoid(MatCreateSubMatrix(*input_mat, is_row_g_abl, is_col_g_abl, MAT_INITIAL_MATRIX, output_mat)); - PetscCallVoid(MatCreateSubMatrix(*input_mat, *rows_rows, *cols_cols, MAT_INITIAL_MATRIX, output_mat)); -// //PetscCallVoid(MatConvert(tmp_abl, MATMPIAIJKOKKOS, MAT_INITIAL_MATRIX, output_mat)); -// //PetscCallVoid(MatDestroy(&tmp_abl)); -// //PetscCallVoid(MatDestroy(&output_mat_local)); // diag mat no longer needed -// //PetscCallVoid(ISDestroy(&is_row_g_abl)); -// //PetscCallVoid(ISDestroy(&is_col_g_abl)); - return; -// } -// #endif -// PetscIntKokkosView is_col_o_d, garray_output_d; - -// if (!reuse_int) -// { -// PetscInt isstart = 0; -// /* Get start indices on each rank for the new columns */ -// MPI_Scan(&local_cols_col, &isstart, 1, MPIU_INT, MPI_SUM, MPI_COMM_MATRIX); -// isstart -= local_cols_col; - -// // cmap values are encoded through PetscScalar and then cast back to PetscInt, -// // so guard the exact integer range before using VecScatter transport. -// // Anything larger than 9,000 trillion with 64 bit ints and 64 bit floats will break - should be fine for now -// // Can't rely on PetscSFBcast with MPIU_INT as that was intermittently breaking -// // on gpus so want to avoid -// PetscInt max_encoded_value = global_cols_col > 0 ? global_cols_col - 1 : 0; -// PetscCallVoid(check_exact_petscint_to_scalar_encoding(max_encoded_value, MPI_COMM_MATRIX)); - -// // Kokkos version of ISGetSeqIS_SameColDist_Private (mpiaij.c) -// // Uses VecScatter with PetscScalar Vecs (matching PETSc's own pattern) -// // instead of direct PetscSFBcast with MPIU_INT on temporary views. - -// std::cerr << "one " << std::endl; - -// /* (1) iscol is a sub-column vector of mat, pad it with '-1.' to form a full vector x */ -// Vec x_vec, cmap_vec; -// PetscCallVoid(MatCreateVecs(*input_mat, &x_vec, NULL)); -// PetscCallVoid(VecDuplicate(x_vec, &cmap_vec)); - -// // Fill x_vec on device: x[is_col(i)] = is_col(i), rest = -1 + if (reuse_int) + { + PetscCallVoid(MatMPIAIJGetSeqAIJ(*output_mat, &output_mat_local, &output_mat_nonlocal, NULL)); + } + } + else + { + mat_local = *input_mat; + if (reuse_int) output_mat_local = *output_mat; + } + size_t bytes = 0; + + // The diagonal component + MatCreateSubMatrix_Seq_kokkos(&mat_local, is_row_d_d, is_col_d_d, reuse_int, &output_mat_local); + + // The off-diagonal component requires some comms + // Basically a copy of MatCreateSubMatrix_MPIAIJ_SameRowColDist + if (mpi) + { + PetscIntKokkosView is_col_o_d, garray_output_d; + + if (!reuse_int) + { + PetscInt isstart = 0; + /* Get start indices on each rank for the new columns */ + MPI_Scan(&local_cols_col, &isstart, 1, MPIU_INT, MPI_SUM, MPI_COMM_MATRIX); + isstart -= local_cols_col; + + // cmap values are encoded through PetscScalar and then cast back to PetscInt, + // so guard the exact integer range before using VecScatter transport. + // Anything larger than 9,000 trillion with 64 bit ints and 64 bit floats will break - should be fine for now + // Can't rely on PetscSFBcast with MPIU_INT as that was intermittently breaking + // on gpus so want to avoid + PetscInt max_encoded_value = global_cols_col > 0 ? global_cols_col - 1 : 0; + //PetscCallVoid(check_exact_petscint_to_scalar_encoding(max_encoded_value, MPI_COMM_MATRIX)); + + // Kokkos version of ISGetSeqIS_SameColDist_Private (mpiaij.c) + // Uses VecScatter with PetscScalar Vecs (matching PETSc's own pattern) + // instead of direct PetscSFBcast with MPIU_INT on temporary views. + + //std::cerr << "one " << std::endl; + + /* (1) iscol is a sub-column vector of mat, pad it with '-1.' to form a full vector x */ + Vec x_vec, cmap_vec; + PetscCallVoid(MatCreateVecs(*input_mat, &x_vec, NULL)); + PetscCallVoid(VecDuplicate(x_vec, &cmap_vec)); + + // Fill x_vec on device: x[is_col(i)] = is_col(i), rest = -1 -// PetscScalarKokkosView x_scalar_d; -// PetscCallVoid(VecGetKokkosViewWrite(x_vec, &x_scalar_d)); -// Kokkos::deep_copy(exec, x_scalar_d, -1.0); -// Kokkos::parallel_for( -// Kokkos::RangePolicy<>(exec, 0, local_cols_col), KOKKOS_LAMBDA(PetscInt i) { -// x_scalar_d(is_col_d_d(i)) = (PetscScalar)is_col_d_d(i); -// }); -// PetscCallVoid(VecRestoreKokkosViewWrite(x_vec, &x_scalar_d)); + PetscScalarKokkosView x_scalar_d; + PetscCallVoid(VecGetKokkosViewWrite(x_vec, &x_scalar_d)); + Kokkos::deep_copy(exec, x_scalar_d, -1.0); + Kokkos::parallel_for( + Kokkos::RangePolicy<>(exec, 0, local_cols_col), KOKKOS_LAMBDA(PetscInt i) { + x_scalar_d(is_col_d_d(i)) = (PetscScalar)is_col_d_d(i); + }); + PetscCallVoid(VecRestoreKokkosViewWrite(x_vec, &x_scalar_d)); -// std::cerr << "two " << std::endl; + //std::cerr << "two " << std::endl; -// /* (2) Scatter x and cmap using Mvctx to get their off-process portions */ -// // Keep at most one active communication on Mvctx at a time. -// // While Begin/End is in flight, do not touch the corresponding send/recv buffers. -// Vec x_leaf_vec; -// PetscCallVoid(VecDuplicate(mat_mpi->lvec, &x_leaf_vec)); -// // Ensure send/receive buffers are stable before Begin. -// Kokkos::fence(); -// std::cerr << "two a " << std::endl; + /* (2) Scatter x and cmap using Mvctx to get their off-process portions */ + // Keep at most one active communication on Mvctx at a time. + // While Begin/End is in flight, do not touch the corresponding send/recv buffers. + Vec x_leaf_vec; + PetscCallVoid(VecDuplicate(mat_mpi->lvec, &x_leaf_vec)); + // Ensure send/receive buffers are stable before Begin. + Kokkos::fence(); + //std::cerr << "two a " << std::endl; -// PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, x_vec, x_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); -// // x scatter completed: x_leaf_vec is now safe to read. -// PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, x_vec, x_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, x_vec, x_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + // x scatter completed: x_leaf_vec is now safe to read. + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, x_vec, x_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); -// std::cerr << "two b" << std::endl; + //std::cerr << "two b" << std::endl; -// // Fill cmap_vec on device: cmap[is_col(i)] = i + isstart, rest = -1 + // Fill cmap_vec on device: cmap[is_col(i)] = i + isstart, rest = -1 -// PetscScalarKokkosView cmap_scalar_d; -// PetscCallVoid(VecGetKokkosViewWrite(cmap_vec, &cmap_scalar_d)); -// Kokkos::deep_copy(exec, cmap_scalar_d, -1.0); -// Kokkos::parallel_for( -// Kokkos::RangePolicy<>(exec, 0, local_cols_col), KOKKOS_LAMBDA(PetscInt i) { -// cmap_scalar_d(is_col_d_d(i)) = (PetscScalar)(i + isstart); -// }); -// PetscCallVoid(VecRestoreKokkosViewWrite(cmap_vec, &cmap_scalar_d)); + PetscScalarKokkosView cmap_scalar_d; + PetscCallVoid(VecGetKokkosViewWrite(cmap_vec, &cmap_scalar_d)); + Kokkos::deep_copy(exec, cmap_scalar_d, -1.0); + Kokkos::parallel_for( + Kokkos::RangePolicy<>(exec, 0, local_cols_col), KOKKOS_LAMBDA(PetscInt i) { + cmap_scalar_d(is_col_d_d(i)) = (PetscScalar)(i + isstart); + }); + PetscCallVoid(VecRestoreKokkosViewWrite(cmap_vec, &cmap_scalar_d)); -// std::cerr << "three " << std::endl; - -// Vec lcmap_vec; -// PetscCallVoid(VecDuplicate(mat_mpi->lvec, &lcmap_vec)); - -// /* (3) Count how many off-local columns match */ -// PetscInt col_ao_output = 0; - -// // One bigger for exclusive scan -// auto is_col_o_match_d = PetscIntKokkosView("is_col_o_match_d", cols_ao+1); -// Kokkos::deep_copy(exec, is_col_o_match_d, 0); - -// // Start cmap scatter only after finishing x scatter on the same Mvctx. -// // Ensure send/receive buffers are stable before Begin. -// Kokkos::fence(); -// PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, cmap_vec, lcmap_vec, INSERT_VALUES, SCATTER_FORWARD)); -// // cmap scatter completed: lcmap_vec is now safe to read. -// PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, cmap_vec, lcmap_vec, INSERT_VALUES, SCATTER_FORWARD)); - -// //if (cols_ao > 0) -// //{ -// ConstPetscScalarKokkosView lvec_scalar_d; -// PetscCallVoid(VecGetKokkosView(x_leaf_vec, &lvec_scalar_d)); - -// Kokkos::parallel_reduce("FindMatches", Kokkos::RangePolicy<>(exec, 0, cols_ao), -// KOKKOS_LAMBDA(const PetscInt i, PetscInt& thread_sum) { -// // This is the scattered x for all of the non-local columns in the input mat -// // It's not -1.0 if that column is present on another rank -// if (lvec_scalar_d(i) > -1.0) { -// thread_sum++; -// is_col_o_match_d(i) = 1; // Mark this as a match -// } -// }, -// Kokkos::Sum(col_ao_output) -// ); - -// PetscCallVoid(VecRestoreKokkosView(x_leaf_vec, &lvec_scalar_d)); -// //} - -// std::cerr << "four " << std::endl; - - -// // Need to do an exclusive scan on is_col_o_match_d to get the new local indices -// // Have to remember to go up to cols_ao+1 -// Kokkos::parallel_scan(Kokkos::RangePolicy<>(exec, 0, cols_ao+1), KOKKOS_LAMBDA(const PetscInt i, PetscInt& partial_sum, const bool is_final) { -// const int input_value = is_col_o_match_d(i); -// if (is_final) { -// is_col_o_match_d(i) = partial_sum; // Write exclusive prefix -// } -// partial_sum += input_value; // Update running total -// }); - -// // ~~~~~~~~~~~~ -// // DIAGNOSTIC (Step 1 of plan): the parallel_reduce above produced -// // col_ao_output on the host while the scan produced the per-index -// // prefix sum on device. They must agree on the total count; if they -// // don't, the size of is_col_o_d / garray_output_d below is wrong and -// // the subsequent scatter kernel will write out of bounds. -// // ~~~~~~~~~~~~ -// { -// PetscInt scan_total_h = 0; -// auto tail_sv = Kokkos::subview(is_col_o_match_d, cols_ao); -// Kokkos::View tail_h("PFLARE_DBG_scan_tail"); -// Kokkos::deep_copy(exec, tail_h, tail_sv); -// Kokkos::fence(); -// scan_total_h = tail_h(); -// PetscCheckAbort(scan_total_h == col_ao_output, MPI_COMM_MATRIX, -// PETSC_ERR_PLIB, -// "MatCreateSubMatrix_kokkos_view: parallel_reduce count (%" PetscInt_FMT ") disagrees with scan total (%" PetscInt_FMT "), cols_ao=%" PetscInt_FMT, -// col_ao_output, scan_total_h, cols_ao); -// PetscCheckAbort(col_ao_output >= 0 && col_ao_output <= cols_ao, MPI_COMM_MATRIX, -// PETSC_ERR_PLIB, -// "MatCreateSubMatrix_kokkos_view: col_ao_output=%" PetscInt_FMT " outside [0,%" PetscInt_FMT "]", -// col_ao_output, cols_ao); -// } - -// // Local indices into input garray of the columns we want to keep -// // but remember this doesn't mean garray_output = garray_input(is_col_o_d) -// // as the of columns we have in the output has changed, ie we need -// // the cmap_d given it has isstart -// is_col_o_d = PetscIntKokkosView("is_col_o_d", col_ao_output); -// garray_output_d = PetscIntKokkosView("garray_output_d", col_ao_output); - -// // Loop over all the cols in the input matrix -// //{ -// ConstPetscScalarKokkosView lcmap_scalar_d; -// PetscCallVoid(VecGetKokkosView(lcmap_vec, &lcmap_scalar_d)); - -// Kokkos::parallel_for( -// Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(PetscInt i) { - -// // We can tell if is_col_o_match_d had 1 in it in this position by comparing the result -// // of the exclusive scan for this index and the next one -// if (is_col_o_match_d(i+1) > is_col_o_match_d(i)) -// { -// is_col_o_d(is_col_o_match_d(i)) = i; -// garray_output_d(is_col_o_match_d(i)) = (PetscInt)lcmap_scalar_d(i); -// } -// }); -// // Fence so the parallel for finishes -// Kokkos::fence(); - -// PetscCallVoid(VecRestoreKokkosView(lcmap_vec, &lcmap_scalar_d)); -// //} - -// std::cerr << "five " << std::endl; - - -// // Cleanup Vecs -// PetscCallVoid(VecDestroy(&x_vec)); -// PetscCallVoid(VecDestroy(&x_leaf_vec)); -// PetscCallVoid(VecDestroy(&cmap_vec)); -// PetscCallVoid(VecDestroy(&lcmap_vec)); -// } -// // If we're reusing we have the iscol_o associated with the output_mat -// else -// { -// // Get the iscol_o from the output_mat -// IS iscol_o; -// /* Retrieve isrow_d, iscol_d and iscol_o from output */ -// PetscCallVoid(PetscObjectQuery((PetscObject)(*output_mat), "iscol_o", (PetscObject *)&iscol_o)); -// //PetscCheck(iscol_o, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "iscol_o passed in was not used before, cannot reuse"); - -// const PetscInt *iscol_o_indices_ptr; -// PetscCallVoid(ISGetIndices(iscol_o, &iscol_o_indices_ptr)); - -// PetscInt local_cols_iscol_o; -// PetscCallVoid(ISGetLocalSize(iscol_o, &local_cols_iscol_o)); - -// // Copy the iscol_o to the device -// auto iscol_o_view_h = PetscIntConstKokkosViewHost(iscol_o_indices_ptr, local_cols_iscol_o); -// is_col_o_d = PetscIntKokkosView("is_col_o_d", local_cols_iscol_o); -// Kokkos::deep_copy(exec, is_col_o_d, iscol_o_view_h); -// // Log copy with petsc -// bytes = iscol_o_view_h.extent(0) * sizeof(PetscInt); -// PetscCallVoid(PetscLogCpuToGpu(bytes)); -// Kokkos::fence(); - -// PetscCallVoid(ISRestoreIndices(iscol_o, &iscol_o_indices_ptr)); -// } - -// // We can now create the off-diagonal component -// Kokkos::fence(); -// MatCreateSubMatrix_Seq_kokkos(&mat_nonlocal, is_row_d_d, is_col_o_d, reuse_int, &output_mat_nonlocal); - -// // If it's our first time through we have to create our output matrix -// if (!reuse_int) -// { -// std::cerr << "six " << std::endl; - -// // Copy the garray output to the host -// PetscInt *garray_host = NULL; -// PetscCallVoid(PetscMalloc1(garray_output_d.extent(0), &garray_host)); -// PetscIntKokkosViewHost colmap_output_h = PetscIntKokkosViewHost(garray_host, garray_output_d.extent(0)); -// // Copy the garray output to the host -// Kokkos::deep_copy(exec, colmap_output_h, garray_output_d); -// Kokkos::fence(); -// bytes = colmap_output_h.extent(0) * sizeof(PetscInt); -// PetscCallVoid(PetscLogGpuToCpu(bytes)); - -// std::cerr << "seven " << std::endl; + //std::cerr << "three " << std::endl; + + Vec lcmap_vec; + PetscCallVoid(VecDuplicate(mat_mpi->lvec, &lcmap_vec)); + + /* (3) Count how many off-local columns match */ + PetscInt col_ao_output = 0; + + // One bigger for exclusive scan + auto is_col_o_match_d = PetscIntKokkosView("is_col_o_match_d", cols_ao+1); + Kokkos::deep_copy(exec, is_col_o_match_d, 0); + + // Start cmap scatter only after finishing x scatter on the same Mvctx. + // Ensure send/receive buffers are stable before Begin. + Kokkos::fence(); + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, cmap_vec, lcmap_vec, INSERT_VALUES, SCATTER_FORWARD)); + // cmap scatter completed: lcmap_vec is now safe to read. + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, cmap_vec, lcmap_vec, INSERT_VALUES, SCATTER_FORWARD)); + + //if (cols_ao > 0) + //{ + ConstPetscScalarKokkosView lvec_scalar_d; + PetscCallVoid(VecGetKokkosView(x_leaf_vec, &lvec_scalar_d)); + + Kokkos::parallel_reduce("FindMatches", Kokkos::RangePolicy<>(exec, 0, cols_ao), + KOKKOS_LAMBDA(const PetscInt i, PetscInt& thread_sum) { + // This is the scattered x for all of the non-local columns in the input mat + // It's not -1.0 if that column is present on another rank + if (lvec_scalar_d(i) > -1.0) { + thread_sum++; + is_col_o_match_d(i) = 1; // Mark this as a match + } + }, + Kokkos::Sum(col_ao_output) + ); + + PetscCallVoid(VecRestoreKokkosView(x_leaf_vec, &lvec_scalar_d)); + //} + + //std::cerr << "four " << std::endl; + + + // Need to do an exclusive scan on is_col_o_match_d to get the new local indices + // Have to remember to go up to cols_ao+1 + Kokkos::parallel_scan(Kokkos::RangePolicy<>(exec, 0, cols_ao+1), KOKKOS_LAMBDA(const PetscInt i, PetscInt& partial_sum, const bool is_final) { + const int input_value = is_col_o_match_d(i); + if (is_final) { + is_col_o_match_d(i) = partial_sum; // Write exclusive prefix + } + partial_sum += input_value; // Update running total + }); + + // Local indices into input garray of the columns we want to keep + // but remember this doesn't mean garray_output = garray_input(is_col_o_d) + // as the of columns we have in the output has changed, ie we need + // the cmap_d given it has isstart + is_col_o_d = PetscIntKokkosView("is_col_o_d", col_ao_output); + garray_output_d = PetscIntKokkosView("garray_output_d", col_ao_output); + + // Loop over all the cols in the input matrix + //{ + ConstPetscScalarKokkosView lcmap_scalar_d; + PetscCallVoid(VecGetKokkosView(lcmap_vec, &lcmap_scalar_d)); + + Kokkos::parallel_for( + Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(PetscInt i) { + + // We can tell if is_col_o_match_d had 1 in it in this position by comparing the result + // of the exclusive scan for this index and the next one + if (is_col_o_match_d(i+1) > is_col_o_match_d(i)) + { + is_col_o_d(is_col_o_match_d(i)) = i; + garray_output_d(is_col_o_match_d(i)) = (PetscInt)lcmap_scalar_d(i); + } + }); + // Fence so the parallel for finishes + Kokkos::fence(); + + PetscCallVoid(VecRestoreKokkosView(lcmap_vec, &lcmap_scalar_d)); + //} + + //std::cerr << "five " << std::endl; + + + // Cleanup Vecs + PetscCallVoid(VecDestroy(&x_vec)); + PetscCallVoid(VecDestroy(&x_leaf_vec)); + PetscCallVoid(VecDestroy(&cmap_vec)); + PetscCallVoid(VecDestroy(&lcmap_vec)); + } + // If we're reusing we have the iscol_o associated with the output_mat + else + { + // Get the iscol_o from the output_mat + IS iscol_o; + /* Retrieve isrow_d, iscol_d and iscol_o from output */ + PetscCallVoid(PetscObjectQuery((PetscObject)(*output_mat), "iscol_o", (PetscObject *)&iscol_o)); + //PetscCheck(iscol_o, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "iscol_o passed in was not used before, cannot reuse"); + + const PetscInt *iscol_o_indices_ptr; + PetscCallVoid(ISGetIndices(iscol_o, &iscol_o_indices_ptr)); + + PetscInt local_cols_iscol_o; + PetscCallVoid(ISGetLocalSize(iscol_o, &local_cols_iscol_o)); + + // Copy the iscol_o to the device + auto iscol_o_view_h = PetscIntConstKokkosViewHost(iscol_o_indices_ptr, local_cols_iscol_o); + is_col_o_d = PetscIntKokkosView("is_col_o_d", local_cols_iscol_o); + Kokkos::deep_copy(exec, is_col_o_d, iscol_o_view_h); + // Log copy with petsc + bytes = iscol_o_view_h.extent(0) * sizeof(PetscInt); + PetscCallVoid(PetscLogCpuToGpu(bytes)); + + PetscCallVoid(ISRestoreIndices(iscol_o, &iscol_o_indices_ptr)); + } + + // We can now create the off-diagonal component + Kokkos::fence(); + MatCreateSubMatrix_Seq_kokkos(&mat_nonlocal, is_row_d_d, is_col_o_d, reuse_int, &output_mat_nonlocal); + + // If it's our first time through we have to create our output matrix + if (!reuse_int) + { + //std::cerr << "six " << std::endl; + + // Copy the garray output to the host + PetscInt *garray_host = NULL; + PetscCallVoid(PetscMalloc1(garray_output_d.extent(0), &garray_host)); + PetscIntKokkosViewHost colmap_output_h = PetscIntKokkosViewHost(garray_host, garray_output_d.extent(0)); + // Copy the garray output to the host + Kokkos::deep_copy(exec, colmap_output_h, garray_output_d); + Kokkos::fence(); + bytes = colmap_output_h.extent(0) * sizeof(PetscInt); + PetscCallVoid(PetscLogGpuToCpu(bytes)); + + //std::cerr << "seven " << std::endl; -// // We can now create our MPI matrix -// PetscCallVoid(MatCreateMPIAIJWithSeqAIJ(MPI_COMM_MATRIX, global_rows_row, global_cols_col, output_mat_local, output_mat_nonlocal, garray_host, output_mat)); - -// std::cerr << "eight " << std::endl; - -// // ~~~~~~~~~~~~~~ -// // If this is the first time through, we need to store the iscol_o in the output_mat -// // We don't store the is_row_d_d or is_col_d_d like the host version does as they're super cheap to rebuild -// // ~~~~~~~~~~~~~~ -// // Copy the is_col_o_d to the host -// PetscInt *is_col_o_host = NULL; -// PetscCallVoid(PetscMalloc1(is_col_o_d.extent(0), &is_col_o_host)); -// PetscIntKokkosViewHost is_col_o_h = PetscIntKokkosViewHost(is_col_o_host, is_col_o_d.extent(0)); -// // Copy the is_col_o_d output to the host -// Kokkos::deep_copy(exec, is_col_o_h, is_col_o_d); -// Kokkos::fence(); -// bytes = is_col_o_h.extent(0) * sizeof(PetscInt); -// PetscCallVoid(PetscLogGpuToCpu(bytes)); -// // Now create an IS -// IS iscol_o; -// PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, is_col_o_h.extent(0), is_col_o_host, PETSC_COPY_VALUES, &iscol_o)); -// // Register it with the output_mat -// PetscCallVoid(PetscObjectCompose((PetscObject)(*output_mat), "iscol_o", (PetscObject)iscol_o)); -// // The ref counter is incremented by the compose -// //PetscCallVoid(ISDestroy(&iscol_o)); - -// std::cerr << "nine " << std::endl; - -// } -// } -// else -// { -// *output_mat = output_mat_local; -// } - -// return; + // We can now create our MPI matrix + PetscCallVoid(MatCreateMPIAIJWithSeqAIJ(MPI_COMM_MATRIX, global_rows_row, global_cols_col, output_mat_local, output_mat_nonlocal, garray_host, output_mat)); + + //std::cerr << "eight " << std::endl; + + // ~~~~~~~~~~~~~~ + // If this is the first time through, we need to store the iscol_o in the output_mat + // We don't store the is_row_d_d or is_col_d_d like the host version does as they're super cheap to rebuild + // ~~~~~~~~~~~~~~ + // Copy the is_col_o_d to the host + PetscInt *is_col_o_host = NULL; + PetscCallVoid(PetscMalloc1(is_col_o_d.extent(0), &is_col_o_host)); + PetscIntKokkosViewHost is_col_o_h = PetscIntKokkosViewHost(is_col_o_host, is_col_o_d.extent(0)); + // Copy the is_col_o_d output to the host + Kokkos::deep_copy(exec, is_col_o_h, is_col_o_d); + Kokkos::fence(); + bytes = is_col_o_h.extent(0) * sizeof(PetscInt); + PetscCallVoid(PetscLogGpuToCpu(bytes)); + // Now create an IS + IS iscol_o; + PetscCallVoid(ISCreateGeneral(PETSC_COMM_SELF, is_col_o_h.extent(0), is_col_o_host, PETSC_OWN_POINTER, &iscol_o)); + // Register it with the output_mat + PetscCallVoid(PetscObjectCompose((PetscObject)(*output_mat), "iscol_o", (PetscObject)iscol_o)); + // The ref counter is incremented by the compose + PetscCallVoid(ISDestroy(&iscol_o)); + + //std::cerr << "nine " << std::endl; + + } + } + else + { + *output_mat = output_mat_local; + } + + return; } //------------------------------------------------------------------------------------------------------------------------ @@ -2875,7 +2594,6 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos(Mat *input_mat, IS *is_row, IS *is_c const int our_level, const int is_row_fine_int, const int is_col_fine_int) { //PflareKokkosTrace _trace("MatCreateSubMatrix_kokkos"); - PetscInt global_row_start, global_row_end_plus_one; PetscInt global_col_start, global_col_end_plus_one; PetscCallVoid(MatGetOwnershipRange(*input_mat, &global_row_start, &global_row_end_plus_one)); @@ -2891,82 +2609,74 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos(Mat *input_mat, IS *is_row, IS *is_c const int level_idx = our_level - 1; auto exec = PetscGetKokkosExecutionSpace(); - // // If we want the input is_row and is_col to be used - // if (our_level == -1) - // { - // // Get pointers to the indices on the host - // const PetscInt *is_row_indices_ptr, *is_col_indices_ptr; - // PetscCallVoid(ISGetIndices(*is_row, &is_row_indices_ptr)); - // PetscCallVoid(ISGetIndices(*is_col, &is_col_indices_ptr)); - - // PetscInt local_rows_row, local_cols_col; - // PetscCallVoid(ISGetLocalSize(*is_row, &local_rows_row)); - // PetscCallVoid(ISGetLocalSize(*is_col, &local_cols_col)); - - // // Create a host view of the existing indices - // auto is_row_view_h = PetscIntConstKokkosViewHost(is_row_indices_ptr, local_rows_row); - // is_row_d_d = PetscIntKokkosView("is_row_d_d", local_rows_row); - // auto is_col_view_h = PetscIntConstKokkosViewHost(is_col_indices_ptr, local_cols_col); - // is_col_d_d = PetscIntKokkosView("is_col_d_d", local_cols_col); - // // Copy indices to the device - // Kokkos::deep_copy(exec, is_row_d_d, is_row_view_h); - // Kokkos::deep_copy(exec, is_col_d_d, is_col_view_h); - // // The source pointers come from ISGetIndices; ensure async copies complete - // // before restoring those host buffers. - // Kokkos::fence(); - // // Log copy with petsc - // size_t bytes = is_row_view_h.extent(0) * sizeof(PetscInt); - // PetscCallVoid(PetscLogCpuToGpu(bytes)); - // bytes = is_col_view_h.extent(0) * sizeof(PetscInt); - // PetscCallVoid(PetscLogCpuToGpu(bytes)); - - // PetscCallVoid(ISRestoreIndices(*is_row, &is_row_indices_ptr)); - // PetscCallVoid(ISRestoreIndices(*is_col, &is_col_indices_ptr)); - - // // ~~~~~~~~~~~~ - // // Rewrite to local indices - // // ~~~~~~~~~~~~ - // Kokkos::parallel_for( - // Kokkos::RangePolicy<>(exec, 0, is_row_d_d.extent(0)), KOKKOS_LAMBDA(PetscInt i) { - - // is_row_d_d(i) -= global_row_start; // Make local - // }); - - // Kokkos::parallel_for( - // Kokkos::RangePolicy<>(exec, 0, is_col_d_d.extent(0)), KOKKOS_LAMBDA(PetscInt i) { - - // is_col_d_d(i) -= global_col_start; // Make local - // }); - // Kokkos::fence(); - // } - // // Instead if we tell the routine that the is_row and is_col are fine/coarse local indices - // // that already are on the device - // else - // { - // if (is_row_fine_int) - // { - // is_row_d_d = *IS_fine_views_local[level_idx]; - // } - // else - // { - // is_row_d_d = *IS_coarse_views_local[level_idx]; - // } - // if (is_col_fine_int) - // { - // is_col_d_d = *IS_fine_views_local[level_idx]; - // } - // else - // { - // is_col_d_d = *IS_coarse_views_local[level_idx]; - // } - // } - - // ### path 2 - PetscCallVoid(MatCreateSubMatrix(*input_mat, *is_row, *is_col, MAT_INITIAL_MATRIX, output_mat)); - // return; - - // ### path 1 - // MatCreateSubMatrix_kokkos_view(input_mat, is_row_d_d, global_rows_row, is_col_d_d, global_cols_col, reuse_int, output_mat, is_row, is_col); + // If we want the input is_row and is_col to be used + if (our_level == -1) + { + // Get pointers to the indices on the host + const PetscInt *is_row_indices_ptr, *is_col_indices_ptr; + PetscCallVoid(ISGetIndices(*is_row, &is_row_indices_ptr)); + PetscCallVoid(ISGetIndices(*is_col, &is_col_indices_ptr)); + + PetscInt local_rows_row, local_cols_col; + PetscCallVoid(ISGetLocalSize(*is_row, &local_rows_row)); + PetscCallVoid(ISGetLocalSize(*is_col, &local_cols_col)); + + // Create a host view of the existing indices + auto is_row_view_h = PetscIntConstKokkosViewHost(is_row_indices_ptr, local_rows_row); + is_row_d_d = PetscIntKokkosView("is_row_d_d", local_rows_row); + auto is_col_view_h = PetscIntConstKokkosViewHost(is_col_indices_ptr, local_cols_col); + is_col_d_d = PetscIntKokkosView("is_col_d_d", local_cols_col); + // Copy indices to the device + Kokkos::deep_copy(exec, is_row_d_d, is_row_view_h); + Kokkos::deep_copy(exec, is_col_d_d, is_col_view_h); + // Log copy with petsc + size_t bytes = is_row_view_h.extent(0) * sizeof(PetscInt); + PetscCallVoid(PetscLogCpuToGpu(bytes)); + bytes = is_col_view_h.extent(0) * sizeof(PetscInt); + PetscCallVoid(PetscLogCpuToGpu(bytes)); + + PetscCallVoid(ISRestoreIndices(*is_row, &is_row_indices_ptr)); + PetscCallVoid(ISRestoreIndices(*is_col, &is_col_indices_ptr)); + + // ~~~~~~~~~~~~ + // Rewrite to local indices + // ~~~~~~~~~~~~ + Kokkos::parallel_for( + Kokkos::RangePolicy<>(exec, 0, is_row_d_d.extent(0)), KOKKOS_LAMBDA(PetscInt i) { + + is_row_d_d(i) -= global_row_start; // Make local + }); + + Kokkos::parallel_for( + Kokkos::RangePolicy<>(exec, 0, is_col_d_d.extent(0)), KOKKOS_LAMBDA(PetscInt i) { + + is_col_d_d(i) -= global_col_start; // Make local + }); + Kokkos::fence(); + } + // Instead if we tell the routine that the is_row and is_col are fine/coarse local indices + // that already are on the device + else + { + if (is_row_fine_int) + { + is_row_d_d = *IS_fine_views_local[level_idx]; + } + else + { + is_row_d_d = *IS_coarse_views_local[level_idx]; + } + if (is_col_fine_int) + { + is_col_d_d = *IS_fine_views_local[level_idx]; + } + else + { + is_col_d_d = *IS_coarse_views_local[level_idx]; + } + } + + MatCreateSubMatrix_kokkos_view(input_mat, is_row_d_d, global_rows_row, is_col_d_d, global_cols_col, reuse_int, output_mat); return; }