diff --git a/include/kokkos_helper.hpp b/include/kokkos_helper.hpp index 7e120072..b8e6a3e9 100644 --- a/include/kokkos_helper.hpp +++ b/include/kokkos_helper.hpp @@ -14,6 +14,23 @@ #include #include #include +#include + +struct PflareKokkosTrace { + const char *name; + PflareKokkosTrace(const char *n) : name(n) { + int rank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + fprintf(stderr, "[PFLARE kokkos rank=%d] Entering %s\n", rank, name); + fflush(stderr); + } + ~PflareKokkosTrace() { + int rank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + fprintf(stderr, "[PFLARE kokkos rank=%d] Leaving %s\n", rank, name); + fflush(stderr); + } +}; using DefaultExecutionSpace = Kokkos::DefaultExecutionSpace; using DefaultMemorySpace = Kokkos::DefaultExecutionSpace::memory_space; @@ -34,6 +51,7 @@ using Scratch2DScalarView = Kokkos::View; PETSC_INTERN void mat_duplicate_copy_plus_diag_kokkos(Mat *, int, Mat *); +PETSC_INTERN void mat_sync(Mat *); PETSC_INTERN void rewrite_j_global_to_local(PetscInt, PetscInt&, PetscIntKokkosView, PetscInt**); PETSC_INTERN void create_cf_is_device_kokkos(Mat *input_mat, const int match_cf, PetscIntKokkosView &is_local_d); PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, const int max_luby_steps, const int pmis_int, PetscScalarKokkosView &measure_local_d, intKokkosView &cf_markers_d, const int zero_measure_c_point_int); @@ -146,4 +164,104 @@ PetscInt binary_search_sorted(const ViewType &sorted_view, const PetscInt size, return -1; } +// Check that every entry in cf_markers_d is either -1 (F) or 1 (C). +// Calls MPI_Abort if any local point is not marked. +inline void check_cf_markers_all_marked_kokkos( + const intKokkosView &cf_markers_d, + const PetscInt local_rows, + MPI_Comm MPI_COMM_MATRIX) +{ + auto exec = PetscGetKokkosExecutionSpace(); + PetscInt bad_count = 0; + Kokkos::parallel_reduce( + "check_cf_markers", + Kokkos::RangePolicy<>(exec, 0, local_rows), + KOKKOS_LAMBDA(const PetscInt i, PetscInt &count) { + if (cf_markers_d(i) != -1 && cf_markers_d(i) != 1) count++; + }, bad_count); + Kokkos::fence(); + int rank = 0; + MPI_Comm_rank(MPI_COMM_MATRIX, &rank); + if (bad_count > 0) { + fprintf(stderr, + "[PFLARE kokkos rank=%d] ERROR check_cf_markers_all_marked_kokkos: " + "%d / %d local points are NOT marked F or C\n", + rank, (int)bad_count, (int)local_rows); + fflush(stderr); + MPI_Abort(MPI_COMM_MATRIX, 1); + } + // else { + // fprintf(stderr, + // "[PFLARE kokkos rank=%d] check_cf_markers_all_marked_kokkos: " + // "all %d local points marked F or C OK\n", + // rank, (int)local_rows); + // fflush(stderr); + // } +} + +// Check that is_fine_local_d and is_coarse_local_d together cover every local +// point [0, local_rows-1] exactly once (no missing, no duplicates). +// Call before global-index conversion (entries are local offsets [0, local_rows-1]). +// Calls MPI_Abort if any point is missing or duplicated. +inline void check_cf_is_all_local_kokkos( + const PetscIntKokkosView &is_fine_local_d, + const PetscIntKokkosView &is_coarse_local_d, + const PetscInt local_rows, + MPI_Comm MPI_COMM_MATRIX) +{ + auto exec = PetscGetKokkosExecutionSpace(); + int rank = 0; + MPI_Comm_rank(MPI_COMM_MATRIX, &rank); + + // Allocate hit-count array, initialised to 0 + intKokkosView hit_count("hit_count", local_rows); + Kokkos::deep_copy(exec, hit_count, 0); + + // Mark each fine index (atomic to catch duplicates within the fine set) + Kokkos::parallel_for( + "check_cf_is_mark_fine", + Kokkos::RangePolicy<>(exec, 0, (PetscInt)is_fine_local_d.extent(0)), + KOKKOS_LAMBDA(const PetscInt i) { + const PetscInt idx = is_fine_local_d(i); + if (idx >= 0 && idx < local_rows) + Kokkos::atomic_add(&hit_count(idx), 1); + }); + + // Mark each coarse index + Kokkos::parallel_for( + "check_cf_is_mark_coarse", + Kokkos::RangePolicy<>(exec, 0, (PetscInt)is_coarse_local_d.extent(0)), + KOKKOS_LAMBDA(const PetscInt i) { + const PetscInt idx = is_coarse_local_d(i); + if (idx >= 0 && idx < local_rows) + Kokkos::atomic_add(&hit_count(idx), 1); + }); + + // Count any point not hit exactly once + PetscInt bad_count = 0; + Kokkos::parallel_reduce( + "check_cf_is_count_bad", + Kokkos::RangePolicy<>(exec, 0, local_rows), + KOKKOS_LAMBDA(const PetscInt i, PetscInt &count) { + if (hit_count(i) != 1) count++; + }, bad_count); + + Kokkos::fence(); + + if (bad_count > 0) { + fprintf(stderr, + "[PFLARE kokkos rank=%d] ERROR check_cf_is_all_local_kokkos: " + "%d / %d local points are not covered exactly once by fine+coarse IS\n", + rank, (int)bad_count, (int)local_rows); + fflush(stderr); + MPI_Abort(MPI_COMM_MATRIX, 1); + } else { + fprintf(stderr, + "[PFLARE kokkos rank=%d] check_cf_is_all_local_kokkos: " + "fine=%d coarse=%d, all %d local points covered exactly once OK\n", + rank, (int)is_fine_local_d.extent(0), (int)is_coarse_local_d.extent(0), (int)local_rows); + fflush(stderr); + } +} + #endif \ No newline at end of file diff --git a/src/AIR_MG_Setup.F90 b/src/AIR_MG_Setup.F90 index 244c9788..ccf705d0 100644 --- a/src/AIR_MG_Setup.F90 +++ b/src/AIR_MG_Setup.F90 @@ -155,7 +155,9 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) ! We already know how many coarse levels we have if we are re-using if (.NOT. air_data%allocated_matrices_A_ff(our_level) .AND. & our_level .ge. air_data%options%auto_truncate_start_level .AND. & - air_data%options%auto_truncate_start_level /= -1) then + air_data%options%auto_truncate_start_level /= -1) then + + !print *, "starting auto truncate check on level ", our_level call timer_start(TIMER_ID_AIR_TRUNCATE) @@ -168,6 +170,8 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) proc_stride, & air_data%inv_coarsest_poly_data) + !print *, "starting approx inverse ", our_level + ! Start the approximate inverse we'll use on this level call start_approximate_inverse(air_data%coarse_matrix(our_level), & air_data%inv_coarsest_poly_data%inverse_type, & @@ -189,6 +193,8 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) call VecDuplicate(rand_vec, sol_vec, ierr) call VecDuplicate(rand_vec, temp_vec, ierr) + !print *, "starting finish approx inverse ", our_level + ! Finish our approximate inverse call finish_approximate_inverse(air_data%coarse_matrix(our_level), & air_data%inv_coarsest_poly_data%inverse_type, & @@ -209,6 +215,8 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) air_data%inv_coarsest_poly_data%inverse_type == PFLAREINV_NEWTON_NO_EXTRA) .AND. & air_data%options%coarsest_matrix_free_polys) then + !print *, "starting matvecs residual ", our_level + if (air_data%options%coarsest_diag_scale_polys) then call petsc_matvec_right_scale_poly_newton_residual_mf(air_data%inv_A_ff(our_level), rand_vec, temp_vec) else @@ -225,6 +233,8 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) call VecAXPY(temp_vec, -1d0, rand_vec, ierr) end if + !print *, "computing norms ", our_level + ! Get the achieved norm call VecNorm(temp_vec, NORM_2, achieved_rel_tol, ierr) @@ -253,6 +263,8 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) call timer_finish(TIMER_ID_AIR_TRUNCATE) end if + !print *, "starting cf splitting ", our_level + ! ~~~~~~~~~~~~ ! Compute the coarsening ! ~~~~~~~~~~~~ diff --git a/src/AIR_Operators_Setup.F90 b/src/AIR_Operators_Setup.F90 index 25cff957..e8ec2f2c 100644 --- a/src/AIR_Operators_Setup.F90 +++ b/src/AIR_Operators_Setup.F90 @@ -191,7 +191,9 @@ subroutine get_submatrices_start_poly_coeff_comms(input_mat, our_level, air_data ! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ! Pull out the rest of the sub-matrices ! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - call timer_start(TIMER_ID_AIR_EXTRACT) + call timer_start(TIMER_ID_AIR_EXTRACT) + + !print *, "extract afc acf start" ! Only reuse when coarse matrix structure is stable (amount>=2 stores MAT_RAP_DROP) if (air_data%allocated_matrices_A_ff(our_level) .AND. & @@ -199,22 +201,38 @@ subroutine get_submatrices_start_poly_coeff_comms(input_mat, our_level, air_data REUSE_MAT_ACTIVE(MAT_RAP_DROP, air_data%options%reuse_amount)) then call MatCreateSubMatrixWrapper(input_mat, & air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_REUSE_MATRIX, & - air_data%A_fc(our_level), & - our_level = our_level, is_row_fine = .TRUE., is_col_fine = .FALSE.) + air_data%A_fc(our_level)) call MatCreateSubMatrixWrapper(input_mat, & air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_REUSE_MATRIX, & - air_data%A_cf(our_level), & - our_level = our_level, is_row_fine = .FALSE., is_col_fine = .TRUE.) + air_data%A_cf(our_level)) else call MatCreateSubMatrixWrapper(input_mat, & air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_INITIAL_MATRIX, & - air_data%A_fc(our_level), & - our_level = our_level, is_row_fine = .TRUE., is_col_fine = .FALSE.) + air_data%A_fc(our_level)) call MatCreateSubMatrixWrapper(input_mat, & air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_INITIAL_MATRIX, & - air_data%A_cf(our_level), & - our_level = our_level, is_row_fine = .FALSE., is_col_fine = .TRUE.) + air_data%A_cf(our_level)) end if + ! Only reuse when coarse matrix structure is stable (amount>=2 stores MAT_RAP_DROP) + ! if (air_data%allocated_matrices_A_ff(our_level) .AND. & + ! air_data%options%reuse_sparsity .AND. & + ! REUSE_MAT_ACTIVE(MAT_RAP_DROP, air_data%options%reuse_amount)) then + ! call MatCreateSubMatrix(input_mat, & + ! air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_REUSE_MATRIX, & + ! air_data%A_fc(our_level), ierr) + ! call MatCreateSubMatrix(input_mat, & + ! air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_REUSE_MATRIX, & + ! air_data%A_cf(our_level),ierr) + ! else + ! call MatCreateSubMatrix(input_mat, & + ! air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_INITIAL_MATRIX, & + ! air_data%A_fc(our_level), ierr) + ! call MatCreateSubMatrix(input_mat, & + ! air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_INITIAL_MATRIX, & + ! air_data%A_cf(our_level), ierr) + ! end if + + !print *, "extract afc acf done" call timer_finish(TIMER_ID_AIR_EXTRACT) diff --git a/src/DDC_Modulek.kokkos.cxx b/src/DDC_Modulek.kokkos.cxx index 87d0310d..f6f69464 100644 --- a/src/DDC_Modulek.kokkos.cxx +++ b/src/DDC_Modulek.kokkos.cxx @@ -15,11 +15,16 @@ // You have to explicitly call copy_cf_markers_d2h(cf_markers_local) to do this PETSC_INTERN void ddc_kokkos(Mat *input_mat, const PetscReal fraction_swap, const PetscReal max_dd_ratio, const PetscReal max_dd_ratio_achieved, Mat *aff, PetscReal *random_numbers) { - // Can't use the global directly within the parallel + //PflareKokkosTrace _trace("ddc_kokkos"); + // Can't use the global directly within the parallel // regions on the device intKokkosView cf_markers_d = cf_markers_local_d; PetscScalarKokkosView diag_dom_ratio_d = diag_dom_ratio_local_d; PetscIntKokkosView is_fine_local_d; + // Equivalent to calling MatSeqAIJKokkosSyncDevice which is petsc intern + mat_sync(input_mat); + MPI_Comm MPI_COMM_MATRIX; + PetscCallVoid(PetscObjectGetComm((PetscObject)*input_mat, &MPI_COMM_MATRIX)); const int match_cf = -1; // F_POINT == -1 create_cf_is_device_kokkos(input_mat, match_cf, is_fine_local_d); @@ -64,11 +69,15 @@ PETSC_INTERN void ddc_kokkos(Mat *input_mat, const PetscReal fraction_swap, cons // recompute // ~~~~~~~~~~~~~~~ { + Kokkos::fence(); + // Create measure and cf_markers for Aff PetscScalarKokkosView measure_d("measure_d", local_rows_aff); intKokkosView cf_markers_aff_d("cf_markers_aff_d", local_rows_aff); Kokkos::deep_copy(exec, cf_markers_aff_d, 0); + Kokkos::fence(); + // Copy the random numbers from host to device // These are generated in the Fortran wrapper so CPU and Kokkos use the same randoms PetscScalarKokkosViewHost random_h(random_numbers, local_rows_aff); @@ -79,6 +88,8 @@ PETSC_INTERN void ddc_kokkos(Mat *input_mat, const PetscReal fraction_swap, cons const PetscReal max_scale = std::max(10.0, max_dd_ratio_achieved * 2.0); const PetscReal target_ratio = max_dd_ratio; + Kokkos::fence(); + // Build the measure: // pmisr_existing_measure_cf_markers tags the smallest measure as F points // So we feed in measure = max(10, max_achieved*2) - (diag_dom_ratio - random/1e10) @@ -103,6 +114,10 @@ PETSC_INTERN void ddc_kokkos(Mat *input_mat, const PetscReal fraction_swap, cons // pmis_int=0 means PMISR, zero_measure_c_point_int=0 pmisr_existing_measure_implicit_transpose_kokkos(aff, -1, 0, measure_d, cf_markers_aff_d, 0); + //check_cf_markers_all_marked_kokkos(cf_markers_aff_d, cf_markers_aff_d.extent(0), MPI_COMM_MATRIX); + + Kokkos::fence(); + // Swap F-tagged points back into cf_markers_d Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, local_rows_aff), KOKKOS_LAMBDA(PetscInt i) { @@ -112,6 +127,8 @@ PETSC_INTERN void ddc_kokkos(Mat *input_mat, const PetscReal fraction_swap, cons } }); Kokkos::fence(); + + //check_cf_markers_all_marked_kokkos(cf_markers_d, cf_markers_d.extent(0), MPI_COMM_MATRIX); } return; } diff --git a/src/Device_Datak.kokkos.cxx b/src/Device_Datak.kokkos.cxx index b11eb31f..39eb51e0 100644 --- a/src/Device_Datak.kokkos.cxx +++ b/src/Device_Datak.kokkos.cxx @@ -15,12 +15,16 @@ PetscScalarKokkosView diag_dom_ratio_local_d; // Copy the global cf_markers_local_d back to the host PETSC_INTERN void copy_cf_markers_d2h(int *cf_markers_local) { + //PflareKokkosTrace _trace("copy_cf_markers_d2h"); // Host wrapper for cf_markers_local intKokkosViewHost cf_markers_local_h(cf_markers_local, cf_markers_local_d.extent(0)); + auto exec = PetscGetKokkosExecutionSpace(); + // Now copy device cf_markers_local_d back to host // Device to host so don't need to specify exec space - Kokkos::deep_copy(cf_markers_local_h, cf_markers_local_d); + Kokkos::deep_copy(exec, cf_markers_local_h, cf_markers_local_d); + Kokkos::fence(); // Log copy with petsc size_t bytes = cf_markers_local_d.extent(0) * sizeof(int); PetscCallVoid(PetscLogGpuToCpu(bytes)); @@ -33,12 +37,16 @@ PETSC_INTERN void copy_cf_markers_d2h(int *cf_markers_local) // Copy the global diag_dom_ratio_local_d back to the host PETSC_INTERN void copy_diag_dom_ratio_d2h(PetscReal *diag_dom_ratio_local) { + //PflareKokkosTrace _trace("copy_diag_dom_ratio_d2h"); // Host wrapper for diag_dom_ratio_local PetscScalarKokkosViewHost diag_dom_ratio_h(diag_dom_ratio_local, diag_dom_ratio_local_d.extent(0)); + auto exec = PetscGetKokkosExecutionSpace(); + // Copy device diag_dom_ratio_local_d back to host // Device to host so don't need to specify exec space - Kokkos::deep_copy(diag_dom_ratio_h, diag_dom_ratio_local_d); + Kokkos::deep_copy(exec, diag_dom_ratio_h, diag_dom_ratio_local_d); + Kokkos::fence(); // Log copy with petsc size_t bytes = diag_dom_ratio_local_d.extent(0) * sizeof(PetscReal); PetscCallVoid(PetscLogGpuToCpu(bytes)); @@ -51,6 +59,7 @@ PETSC_INTERN void copy_diag_dom_ratio_d2h(PetscReal *diag_dom_ratio_local) // Delete the global cf_markers_local_d PETSC_INTERN void delete_device_cf_markers() { + //PflareKokkosTrace _trace("delete_device_cf_markers"); // Delete the device view - this assigns an empty view // and hence the old view has its ref counter decremented cf_markers_local_d = intKokkosView(); @@ -63,6 +72,7 @@ PETSC_INTERN void delete_device_cf_markers() // Delete the global diag_dom_ratio_local_d PETSC_INTERN void delete_device_diag_dom_ratio() { + //PflareKokkosTrace _trace("delete_device_diag_dom_ratio"); // Delete the device view - this assigns an empty view // and hence the old view has its ref counter decremented diag_dom_ratio_local_d = PetscScalarKokkosView(); @@ -75,6 +85,7 @@ PETSC_INTERN void delete_device_diag_dom_ratio() // Creates the device local indices for F or C points based on the global cf_markers_local_d PETSC_INTERN void create_cf_is_device_kokkos(Mat *input_mat, const int match_cf, PetscIntKokkosView &is_local_d) { + //PflareKokkosTrace _trace("create_cf_is_device_kokkos"); PetscInt local_rows, local_cols; PetscCallVoid(MatGetLocalSize(*input_mat, &local_rows, &local_cols)); auto exec = PetscGetKokkosExecutionSpace(); @@ -109,7 +120,8 @@ PETSC_INTERN void create_cf_is_device_kokkos(Mat *input_mat, const int match_cf, // The last entry in point_offsets_d is the total number of points that match match_cf PetscInt local_rows_row = 0; // Device to host so don't need to specify exec space - Kokkos::deep_copy(local_rows_row, Kokkos::subview(point_offsets_d, local_rows)); + Kokkos::deep_copy(exec, local_rows_row, Kokkos::subview(point_offsets_d, local_rows)); + Kokkos::fence(); // This will be equivalent to is_fine - global_row_start, ie the local indices is_local_d = PetscIntKokkosView("is_local_d", local_rows_row); @@ -134,10 +146,14 @@ PETSC_INTERN void create_cf_is_device_kokkos(Mat *input_mat, const int match_cf, // Creates the host IS is_fine and is_coarse based on the global cf_markers_local_d PETSC_INTERN void create_cf_is_kokkos(Mat *input_mat, IS *is_fine, IS *is_coarse) { + //PflareKokkosTrace _trace("create_cf_is_kokkos"); PetscIntKokkosView is_fine_local_d, is_coarse_local_d; MPI_Comm MPI_COMM_MATRIX; PetscCallVoid(PetscObjectGetComm((PetscObject)*input_mat, &MPI_COMM_MATRIX)); + PetscInt local_rows_check, local_cols_check; + PetscCallVoid(MatGetLocalSize(*input_mat, &local_rows_check, &local_cols_check)); + // Create the local f point indices const int match_fine = -1; // F_POINT == -1 create_cf_is_device_kokkos(input_mat, match_fine, is_fine_local_d); @@ -146,6 +162,10 @@ PETSC_INTERN void create_cf_is_kokkos(Mat *input_mat, IS *is_fine, IS *is_coarse const int match_coarse = 1; // C_POINT == 1 create_cf_is_device_kokkos(input_mat, match_coarse, is_coarse_local_d); + // Sanity check: fine + coarse must cover every local point exactly once + // (check before global-index conversion while entries are still [0, local_rows-1]) + //check_cf_is_all_local_kokkos(is_fine_local_d, is_coarse_local_d, local_rows_check, MPI_COMM_MATRIX); + // Now convert them back to global indices PetscInt global_row_start, global_row_end_plus_one; PetscCallVoid(MatGetOwnershipRange(*input_mat, &global_row_start, &global_row_end_plus_one)); @@ -173,10 +193,13 @@ PETSC_INTERN void create_cf_is_kokkos(Mat *input_mat, IS *is_fine, IS *is_coarse PetscCallVoid(PetscMalloc1(n_coarse, &is_coarse_array)); PetscIntKokkosViewHost is_coarse_h = PetscIntKokkosViewHost(is_coarse_array, n_coarse); + Kokkos::fence(); + // Copy over the indices to the host // Device to host so don't need to specify exec space - Kokkos::deep_copy(is_fine_h, is_fine_local_d); - Kokkos::deep_copy(is_coarse_h, is_coarse_local_d); + Kokkos::deep_copy(exec, is_fine_h, is_fine_local_d); + Kokkos::deep_copy(exec, is_coarse_h, is_coarse_local_d); + Kokkos::fence(); // Log copy with petsc size_t bytes_fine = is_fine_local_d.extent(0) * sizeof(PetscInt); size_t bytes_coarse = is_coarse_local_d.extent(0) * sizeof(PetscInt); diff --git a/src/Gmres_Poly.F90 b/src/Gmres_Poly.F90 index 1e6e7926..b6d26eef 100644 --- a/src/Gmres_Poly.F90 +++ b/src/Gmres_Poly.F90 @@ -477,15 +477,21 @@ subroutine calculate_gmres_polynomial_coefficients_arnoldi(matrix, poly_order, c call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) end if + !print *, "about to muller" + ! ~~~~~~~~~~ ! Allocate space and create random numbers ! The first vec has random numbers in it ! ~~~~~~~~~~ call create_temp_space_box_muller(matrix, subspace_size, V_n) + + !print *, "done muller" ! Create an extra vector for storage call VecDuplicate(V_n(1), w_j, ierr) + !print *, "about to arnoldi" + ! Do the Arnoldi and compute H_n and C_n ! We only compute H_n until we hit a relative residual of 1e-14 against the random rhs ! or we hit the given poly_order @@ -494,6 +500,8 @@ subroutine calculate_gmres_polynomial_coefficients_arnoldi(matrix, poly_order, c call arnoldi(matrix, poly_order, 1d-30, V_n, w_j, beta, H_n, m, C_n, y, rel_tol) if (present(user_rel_tol)) user_rel_tol = rel_tol + !print *, "done arnoldi" + ! ~~~~~~~~~~~~~ ! Compute the polynomial coefficients, this is C_n(1:m, 1:m) y ! ~~~~~~~~~~~~~ diff --git a/src/Gmres_Polyk.kokkos.cxx b/src/Gmres_Polyk.kokkos.cxx index 3df0915a..c51facac 100644 --- a/src/Gmres_Polyk.kokkos.cxx +++ b/src/Gmres_Polyk.kokkos.cxx @@ -8,6 +8,7 @@ PETSC_INTERN void mat_mult_powers_share_sparsity_kokkos(Mat *input_mat, const int poly_order, const int poly_sparsity_order, PetscReal *coefficients, \ const int reuse_int_reuse_mat, Mat *reuse_mat, const int reuse_int_cmat, Mat *output_mat) { + //PflareKokkosTrace _trace("mat_mult_powers_share_sparsity_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows, local_cols; PetscInt global_row_start_temp, global_row_end_plus_one_temp; @@ -18,6 +19,8 @@ PETSC_INTERN void mat_mult_powers_share_sparsity_kokkos(Mat *input_mat, const in PetscInt one = 1; bool deallocate_submatrices = false; + mat_sync(input_mat); + PetscCallVoid(MatGetType(*input_mat, &mat_type)); // Are we in parallel? const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; @@ -162,6 +165,7 @@ PETSC_INTERN void mat_mult_powers_share_sparsity_kokkos(Mat *input_mat, const in // Get pointers to the i,j,vals on the device // This should happen after all the (potentially) host matscale, mataxpy and matshift above // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_submat_i = nullptr, *device_submat_j = nullptr; PetscMemType mtype; PetscScalar *device_submat_vals = nullptr; diff --git a/src/Grid_Transferk.kokkos.cxx b/src/Grid_Transferk.kokkos.cxx index d48fe6d8..7746df2e 100644 --- a/src/Grid_Transferk.kokkos.cxx +++ b/src/Grid_Transferk.kokkos.cxx @@ -7,6 +7,7 @@ // Generate one point classical prolongator but with kokkos - keeping everything on the device PETSC_INTERN void generate_one_point_with_one_entry_from_sparse_kokkos(Mat *input_mat, Mat *output_mat) { + //PflareKokkosTrace _trace("generate_one_point_with_one_entry_from_sparse_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows, local_cols, global_rows, global_cols; PetscInt global_row_start, global_row_end_plus_one; @@ -16,6 +17,8 @@ PETSC_INTERN void generate_one_point_with_one_entry_from_sparse_kokkos(Mat *inpu PetscInt nnzs_match_local, nnzs_match_nonlocal; Mat output_mat_local, output_mat_nonlocal; + mat_sync(input_mat); + PetscCallVoid(MatGetType(*input_mat, &mat_type)); // Are we in parallel? const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; @@ -56,6 +59,7 @@ PETSC_INTERN void generate_one_point_with_one_entry_from_sparse_kokkos(Mat *inpu // ~~~~~~~~~~~~ // Get pointers to the i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i = nullptr, *device_local_j = nullptr, *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; PetscMemType mtype; PetscScalar *device_local_vals = nullptr, *device_nonlocal_vals = nullptr; @@ -305,6 +309,7 @@ PETSC_INTERN void generate_one_point_with_one_entry_from_sparse_kokkos(Mat *inpu PETSC_INTERN void compute_P_from_W_kokkos(Mat *input_mat, PetscInt global_row_start, IS *is_fine, \ IS *is_coarse, int identity_int, int reuse_int, Mat *output_mat) { + //PflareKokkosTrace _trace("compute_P_from_W_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt global_row_start_W, global_row_end_plus_one_W; PetscInt global_col_start_W, global_col_end_plus_one_W; @@ -315,6 +320,8 @@ PETSC_INTERN void compute_P_from_W_kokkos(Mat *input_mat, PetscInt global_row_st PetscInt nnzs_match_local, nnzs_match_nonlocal; Mat output_mat_local, output_mat_nonlocal; + mat_sync(input_mat); + PetscCallVoid(MatGetType(*input_mat, &mat_type)); // Are we in parallel? const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; @@ -359,6 +366,7 @@ PETSC_INTERN void compute_P_from_W_kokkos(Mat *input_mat, PetscInt global_row_st PetscCallVoid(PetscLogCpuToGpu(bytes)); bytes = coarse_view_h.extent(0) * sizeof(PetscInt); PetscCallVoid(PetscLogCpuToGpu(bytes)); + Kokkos::fence(); local_cols_coarse = local_rows_coarse; local_cols = local_rows_coarse + local_rows_fine; @@ -375,6 +383,7 @@ PETSC_INTERN void compute_P_from_W_kokkos(Mat *input_mat, PetscInt global_row_st // ~~~~~~~~~~~~ // Get pointers to the i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i = nullptr, *device_local_j = nullptr, *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; PetscMemType mtype; PetscScalar *device_local_vals = nullptr, *device_nonlocal_vals = nullptr; @@ -596,6 +605,7 @@ PETSC_INTERN void compute_P_from_W_kokkos(Mat *input_mat, PetscInt global_row_st // Annoyingly there isn't currently the ability to get views for i (or j) const PetscInt *device_local_i_output = nullptr, *device_nonlocal_i_ouput = nullptr; PetscMemType mtype; + Kokkos::fence(); PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_local_output, &device_local_i_output, NULL, NULL, &mtype)); if (mpi) PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_nonlocal_output, &device_nonlocal_i_ouput, NULL, NULL, &mtype)); @@ -727,6 +737,7 @@ PETSC_INTERN void compute_R_from_Z_kokkos(Mat *input_mat, PetscInt global_row_st IS *is_coarse, IS *orig_fine_col_indices, int identity_int, int reuse_int, int reuse_indices_int, \ Mat *output_mat) { + //PflareKokkosTrace _trace("compute_R_from_Z_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt global_row_start_Z, global_row_end_plus_one_Z; PetscInt global_col_start_Z, global_col_end_plus_one_Z; @@ -739,6 +750,8 @@ PETSC_INTERN void compute_R_from_Z_kokkos(Mat *input_mat, PetscInt global_row_st PetscInt nnzs_match_local, nnzs_match_nonlocal; Mat output_mat_local, output_mat_nonlocal; + mat_sync(input_mat); + PetscCallVoid(MatGetType(*input_mat, &mat_type)); // Are we in parallel? const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; @@ -863,6 +876,7 @@ PETSC_INTERN void compute_R_from_Z_kokkos(Mat *input_mat, PetscInt global_row_st // ~~~~~~~~~~~~ // Get pointers to the i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i = nullptr, *device_local_j = nullptr, *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; PetscMemType mtype; PetscScalar *device_local_vals = nullptr, *device_nonlocal_vals = nullptr; @@ -1070,6 +1084,7 @@ PETSC_INTERN void compute_R_from_Z_kokkos(Mat *input_mat, PetscInt global_row_st // Annoyingly there isn't currently the ability to get views for i (or j) const PetscInt *device_local_i_output = nullptr, *device_local_j_output = nullptr, *device_nonlocal_i_ouput = nullptr; PetscMemType mtype; + Kokkos::fence(); PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_local_output, &device_local_i_output, &device_local_j_output, NULL, &mtype)); if (mpi) PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_nonlocal_output, &device_nonlocal_i_ouput, NULL, NULL, &mtype)); @@ -1154,6 +1169,7 @@ PETSC_INTERN void compute_R_from_Z_kokkos(Mat *input_mat, PetscInt global_row_st // Now we have to sort the local column indices, as we add in the identity at the // end of our local j indices KokkosCsrMatrix csrmat_local = KokkosCsrMatrix("csrmat_local", local_rows_z, local_full_cols, a_local_d.extent(0), a_local_d, i_local_d, j_local_d); + Kokkos::fence(); KokkosSparse::sort_crs_matrix(csrmat_local); // Let's make sure everything on the device is finished diff --git a/src/MatDiagDomk.kokkos.cxx b/src/MatDiagDomk.kokkos.cxx index 3b1a7062..7eaebcbb 100644 --- a/src/MatDiagDomk.kokkos.cxx +++ b/src/MatDiagDomk.kokkos.cxx @@ -14,8 +14,13 @@ // This code is very similar to MatCreateSubMatrix_kokkos PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio_achieved, PetscInt *local_rows_aff) { + //PflareKokkosTrace _trace("MatDiagDomRatio_kokkos"); PetscInt local_rows, local_cols; + Kokkos::fence(); + + mat_sync(input_mat); + // Are we in parallel? MatType mat_type; MPI_Comm MPI_COMM_MATRIX; @@ -45,6 +50,7 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio intKokkosView cf_markers_d = cf_markers_local_d; intKokkosView cf_markers_nonlocal_d; Vec scatter_root_vec = NULL; + Vec scatter_leaf_vec = NULL; PetscIntKokkosView is_fine_local_d; auto exec = PetscGetKokkosExecutionSpace(); @@ -60,6 +66,8 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio diag_dom_ratio_local_d = PetscScalarKokkosView("diag_dom_ratio_local_d", local_rows_row); PetscScalarKokkosView diag_dom_ratio_d = diag_dom_ratio_local_d; + Kokkos::fence(); + // ~~~~~~~~~~~~~~~ // Can now go and compute the diagonal dominance sums // ~~~~~~~~~~~~~~~ @@ -72,6 +80,7 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio // Scatter cf_markers via VecScatter (int -> PetscScalar conversion required) PetscCallVoid(MatCreateVecs(*input_mat, &scatter_root_vec, NULL)); + PetscCallVoid(VecDuplicate(mat_mpi->lvec, &scatter_leaf_vec)); { PetscScalarKokkosView root_scalar_d; PetscCallVoid(VecGetKokkosViewWrite(scatter_root_vec, &root_scalar_d)); @@ -85,8 +94,9 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio // Start comms, then overlap with local-only work below. // Mvctx must have only one active comm at a time. // Ensure send/receive buffers are stable before Begin. - Kokkos::fence(); - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); + Kokkos::fence(); + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); } // ~~~~~~~~~~~~~~~ @@ -96,6 +106,7 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio // ~~~~~~~~~~~~ // Get pointers to the local i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i = nullptr, *device_local_j = nullptr; PetscScalar *device_local_vals = nullptr; PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_local, &device_local_i, &device_local_j, &device_local_vals, &mtype)); @@ -104,6 +115,8 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio PetscScalarKokkosView diag_entry_d = PetscScalarKokkosView("diag_entry_d", local_rows_row); Kokkos::deep_copy(exec, diag_entry_d, 0); + Kokkos::fence(); + // Scoping to reduce peak memory { // We now go and do a reduce to get the diagonal entry, while also @@ -153,23 +166,27 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio diag_dom_ratio_d(i_idx_is_row) = sum_val; }); }); + Kokkos::fence(); } // Finish the in-flight scatter and only then read from the receive buffer. if (mpi) { - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); + Kokkos::fence(); { ConstPetscScalarKokkosView lvec_scalar_d; - PetscCallVoid(VecGetKokkosView(mat_mpi->lvec, &lvec_scalar_d)); + PetscCallVoid(VecGetKokkosView(scatter_leaf_vec, &lvec_scalar_d)); Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(PetscInt i) { cf_markers_nonlocal_d(i) = (int)lvec_scalar_d(i); }); - PetscCallVoid(VecRestoreKokkosView(mat_mpi->lvec, &lvec_scalar_d)); + PetscCallVoid(VecRestoreKokkosView(scatter_leaf_vec, &lvec_scalar_d)); } - PetscCallVoid(VecDestroy(&scatter_root_vec)); + // Ensure the async parallel_for reading scatter_leaf_vec's device memory has completed + // before VecDestroy frees it. Kokkos::fence(); + PetscCallVoid(VecDestroy(&scatter_root_vec)); + PetscCallVoid(VecDestroy(&scatter_leaf_vec)); } // ~~~~~~~~~~~~~~~ @@ -184,6 +201,7 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio // ~~~~~~~~~~~~ // Get pointers to the nonlocal i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; PetscScalar *device_nonlocal_vals = nullptr; PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_nonlocal, &device_nonlocal_i, &device_nonlocal_j, &device_nonlocal_vals, &mtype)); @@ -224,9 +242,12 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio diag_dom_ratio_d(i_idx_is_row) += sum_val; }); }); + Kokkos::fence(); } } + Kokkos::fence(); + // ~~~~~~~~~~~~~ // Compute the diag dominance ratio // ~~~~~~~~~~~~~ @@ -254,6 +275,8 @@ PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio Kokkos::Max(max_dd_ratio_local) ); + Kokkos::fence(); + PetscCallMPIAbort(MPI_COMM_MATRIX, MPI_Allreduce(&max_dd_ratio_local, max_dd_ratio_achieved, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_MATRIX)); return; diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 464001a5..357719b4 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -11,6 +11,7 @@ static PetscErrorCode check_exact_petscint_to_scalar_encoding(PetscInt max_encoded_value, MPI_Comm comm) { PetscFunctionBegin; + PflareKokkosTrace _trace("check_exact_petscint_to_scalar_encoding"); if (max_encoded_value <= 0) PetscFunctionReturn(PETSC_SUCCESS); const int digits = std::numeric_limits::digits; @@ -25,80 +26,124 @@ static PetscErrorCode check_exact_petscint_to_scalar_encoding(PetscInt max_encod //------------------------------------------------------------------------------------------------------------------------ -// Generate the colmap and rewrite input global j indices to local given the calculated colmap -PETSC_INTERN void rewrite_j_global_to_local(PetscInt colmap_max_size, PetscInt &col_ao_output, PetscIntKokkosView j_nonlocal_d, PetscInt **garray_host) +// Sync the kokkos parts of the matrix to make sure they're up to date +PETSC_INTERN void mat_sync(Mat *X) +{ + //PflareKokkosTrace _trace("mat_sync"); + MatType mat_type; + PetscCallVoid(MatGetType(*X, &mat_type)); + // Are we in parallel? + const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; + Mat mat_local_x = NULL, mat_nonlocal_x = NULL; + + const PetscInt *colmap_x; + if (mpi) + { + PetscCallVoid(MatMPIAIJGetSeqAIJ(*X, &mat_local_x, &mat_nonlocal_x, &colmap_x)); + } + else + { + mat_local_x = *X; + } + + Mat_SeqAIJKokkos *mat_local_xkok = static_cast(mat_local_x->spptr); + if (mat_local_xkok->a_dual.need_sync_device()) { + mat_local_xkok->a_dual.sync_device(); + mat_local_xkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ + mat_local_xkok->hermitian_updated = PETSC_FALSE; + } + if (mpi) + { + Mat_SeqAIJKokkos *mat_nonlocal_xkok = static_cast(mat_nonlocal_x->spptr); + if (mat_nonlocal_xkok->a_dual.need_sync_device()) { + mat_nonlocal_xkok->a_dual.sync_device(); + mat_nonlocal_xkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ + mat_nonlocal_xkok->hermitian_updated = PETSC_FALSE; + } + } +} + +//------------------------------------------------------------------------------------------------------------------------ + +// Remap each entry in j_d from a global index to its local index via binary search into garray_d. +// garray_d must be a sorted array of unique global indices. +// Fences internally. +static void remap_j_to_local_device(PetscIntKokkosView j_d, PetscIntKokkosView garray_d, PetscInt col_ao_output) { + //PflareKokkosTrace _trace("remap_j_to_local_device"); + auto exec = PetscGetKokkosExecutionSpace(); + + if (j_d.extent(0) == 0) return; + Kokkos::parallel_for( + Kokkos::RangePolicy<>(exec, 0, j_d.extent(0)), KOKKOS_LAMBDA(const PetscInt i) { + j_d(i) = binary_search_sorted(garray_d, col_ao_output, j_d(i)); + }); + Kokkos::fence(); +} + +//------------------------------------------------------------------------------------------------------------------------ + +// Build garray on device from global indices in j_nonlocal_d and remap j_nonlocal_d to local in-place. +// garray_d (out) is a device view of the sorted unique global column indices (size col_ao_output). +static void rewrite_j_global_to_local_device(PetscInt colmap_max_size, PetscInt &col_ao_output, PetscIntKokkosView j_nonlocal_d, PetscIntKokkosView &garray_d) +{ + //PflareKokkosTrace _trace("rewrite_j_global_to_local_device"); auto exec = PetscGetKokkosExecutionSpace(); // Need to preallocate to the max size - PetscIntKokkosView colmap_output_d("colmap_output_d", colmap_max_size); + PetscIntKokkosView colmap_output_d("colmap_output_d", colmap_max_size); col_ao_output = 0; - // Take a copy of j and sort it and then build garray if (j_nonlocal_d.extent(0) > 0) { ptrdiff_t count_ptr_arith = -1; - // Scoped so we don't keep the copy of j around very long + // Scoped so we don't keep the sorted copy of j around very long { PetscIntKokkosView j_nonlocal_d_sorted("j_nonlocal_d_sorted", j_nonlocal_d.extent(0)); Kokkos::deep_copy(exec, j_nonlocal_d_sorted, j_nonlocal_d); - Kokkos::sort(j_nonlocal_d_sorted); + Kokkos::sort(exec, j_nonlocal_d_sorted); + Kokkos::fence(); // Unique copy returns a copy of sorted j_nonlocal_d_sorted in order, but with all the duplicate entries removed auto unique_end_it = Kokkos::Experimental::unique_copy(exec, j_nonlocal_d_sorted, colmap_output_d); + Kokkos::fence(); auto begin_it = Kokkos::Experimental::begin(colmap_output_d); count_ptr_arith = unique_end_it - begin_it; } col_ao_output = static_cast(count_ptr_arith); - // Create some host space for the output garray (that stays in scope) and copy it - PetscCallVoid(PetscMalloc1(col_ao_output, garray_host)); - PetscIntKokkosViewHost colmap_output_h = PetscIntKokkosViewHost(*garray_host, col_ao_output); PetscInt zero = 0; + garray_d = Kokkos::subview(colmap_output_d, Kokkos::make_pair(zero, col_ao_output)); + + // Remap j_nonlocal_d to local indices using binary search into garray_d + // This fences internally + remap_j_to_local_device(j_nonlocal_d, garray_d, col_ao_output); + } +} + +//------------------------------------------------------------------------------------------------------------------------ + +// Generate the colmap and rewrite input global j indices to local given the calculated colmap +PETSC_INTERN void rewrite_j_global_to_local(PetscInt colmap_max_size, PetscInt &col_ao_output, PetscIntKokkosView j_nonlocal_d, PetscInt **garray_host) +{ + //PflareKokkosTrace _trace("rewrite_j_global_to_local"); + auto exec = PetscGetKokkosExecutionSpace(); + PetscIntKokkosView garray_d; + + // This fences internally + rewrite_j_global_to_local_device(colmap_max_size, col_ao_output, j_nonlocal_d, garray_d); + + // Always allocate host array (even zero-size) + PetscCallVoid(PetscMalloc1(col_ao_output, garray_host)); + if (col_ao_output > 0) + { + PetscIntKokkosViewHost colmap_output_h = PetscIntKokkosViewHost(*garray_host, col_ao_output); // Device to host so don't need to specify exec space - Kokkos::deep_copy(colmap_output_h, Kokkos::subview(colmap_output_d, Kokkos::make_pair(zero, col_ao_output))); + Kokkos::deep_copy(exec, colmap_output_h, garray_d); + Kokkos::fence(); // Log copy with petsc size_t bytes = col_ao_output * sizeof(PetscInt); - PetscCallVoid(PetscLogGpuToCpu(bytes)); - } - - // ~~~~~~~~~~ - // Now we can go and overwrite the global indices in j with the local equivalents - // ~~~~~~~~~~ - // Do we have any nonlocal columns - if (col_ao_output == 0) - { - // Silly but depending on the compiler this may return a non-null pointer - col_ao_output = 0; - PetscCallVoid(PetscMalloc1(col_ao_output, garray_host)); - } - else - { - // Binary search sorted colmap to find our local index - // Originally used Kokkos::UnorderedMap here but it only handles up to uint32_t - // entries - Kokkos::parallel_for( - Kokkos::RangePolicy<>(exec, 0, j_nonlocal_d.extent(0)), KOKKOS_LAMBDA(const PetscInt i) { - - PetscInt low = 0; - PetscInt count = col_ao_output; // Number of elements in colmap_output_d - PetscInt step = -1; - PetscInt mid_idx = -1; - - while (count > 0) { - step = count / 2; - mid_idx = low + step; - if (colmap_output_d(mid_idx) < j_nonlocal_d(i)) { - low = mid_idx + 1; - count -= (step + 1); - } else { - count = step; - } - } - j_nonlocal_d(i) = low; - }); - // Ensure the rewrite is finished before we return - Kokkos::fence(); + PetscCallVoid(PetscLogGpuToCpu(bytes)); } } @@ -109,6 +154,7 @@ PETSC_INTERN void remove_small_from_sparse_kokkos(Mat *input_mat, const PetscRea const int relative_max_row_tolerance_int, const int lump_int, \ const int allow_drop_diagonal_int, const int allow_diag_strength_int) { + //PflareKokkosTrace _trace("remove_small_from_sparse_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows, local_cols, global_rows, global_cols; PetscInt global_row_start_temp, global_row_end_plus_one_temp; @@ -118,6 +164,9 @@ PETSC_INTERN void remove_small_from_sparse_kokkos(Mat *input_mat, const PetscRea PetscInt nnzs_match_local, nnzs_match_nonlocal; Mat output_mat_local, output_mat_nonlocal; + // Equivalent to calling MatSeqAIJKokkosSyncDevice which is petsc intern + mat_sync(input_mat); + PetscCallVoid(MatGetType(*input_mat, &mat_type)); // Are we in parallel? const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; @@ -160,6 +209,7 @@ PETSC_INTERN void remove_small_from_sparse_kokkos(Mat *input_mat, const PetscRea // ~~~~~~~~~~~~ // Get pointers to the i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i = nullptr, *device_local_j = nullptr, *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; PetscMemType mtype; PetscScalar *device_local_vals = nullptr, *device_nonlocal_vals = nullptr; @@ -749,9 +799,19 @@ PETSC_INTERN void remove_small_from_sparse_kokkos(Mat *input_mat, const PetscRea // Let's make sure everything on the device is finished Kokkos::fence(); + // Convert j_nonlocal_d from global to local indices now, before any sort below. + // All global indices (including any diagonals added in the loop above) are finalised. + // garray_d holds the sorted unique global column indices on device. + PetscIntKokkosView garray_d; + PetscInt col_ao_output = 0; + if (mpi) { + // This fences internally + rewrite_j_global_to_local_device(cols_ao, col_ao_output, j_nonlocal_d, garray_d); + } + // Now we may have to sort the column indices if (lump_int) - { + { // Reduce to see if we ever added a diagonal bool added_any_diagonal = false; Kokkos::parallel_reduce( @@ -761,21 +821,22 @@ PETSC_INTERN void remove_small_from_sparse_kokkos(Mat *input_mat, const PetscRea if (!existing_diag_d(i)) thread_result = true; }, Kokkos::LOr(added_any_diagonal) - ); + ); // If we did add a diagonal, it got added to the end of the columns on each row, so will have to sort // It also could have been added to either the local or nonlocal components given not square - if (added_any_diagonal) + if (added_any_diagonal) { - KokkosCsrMatrix csrmat_local = KokkosCsrMatrix("csrmat_local", local_rows, local_cols, a_local_d.extent(0), a_local_d, i_local_d, j_local_d); - KokkosSparse::sort_crs_matrix(csrmat_local); - + KokkosCsrMatrix csrmat_local = KokkosCsrMatrix("csrmat_local", local_rows, local_cols, a_local_d.extent(0), a_local_d, i_local_d, j_local_d); + Kokkos::fence(); + KokkosSparse::sort_crs_matrix(csrmat_local); + if (mpi) { - // The column size is not right here (it will be <= cols_ao) - // but it shouldn't matter as we are only construting an explicit kokkos csr matrix here so it can sort - KokkosCsrMatrix csrmat_nonlocal = KokkosCsrMatrix("csrmat_nonlocal", local_rows, cols_ao, a_nonlocal_d.extent(0), a_nonlocal_d, i_nonlocal_d, j_nonlocal_d); - KokkosSparse::sort_crs_matrix(csrmat_nonlocal); + // j_nonlocal_d now contains local indices; use col_ao_output as numCols + KokkosCsrMatrix csrmat_nonlocal = KokkosCsrMatrix("csrmat_nonlocal", local_rows, col_ao_output, a_nonlocal_d.extent(0), a_nonlocal_d, i_nonlocal_d, j_nonlocal_d); + Kokkos::fence(); + KokkosSparse::sort_crs_matrix(csrmat_nonlocal); } } } @@ -786,23 +847,28 @@ PETSC_INTERN void remove_small_from_sparse_kokkos(Mat *input_mat, const PetscRea PetscCallVoid(MatCreateSeqAIJKokkosWithKokkosViews(PETSC_COMM_SELF, local_rows, local_cols, i_local_d, j_local_d, a_local_d, &output_mat_local)); // we also have to go and build the a, i, j for the non-local off-diagonal block - if (mpi) + if (mpi) { - // Now we need to build garray on the host and rewrite the j_nonlocal_d indices so they are local - // The default values here are for the case where we - // let petsc do it, it resets this internally in MatSetUpMultiply_MPIAIJ + // Copy device garray to host PetscInt *garray_host = NULL; - PetscInt col_ao_output = 0; - // This fences internally - rewrite_j_global_to_local(cols_ao, col_ao_output, j_nonlocal_d, &garray_host); + PetscCallVoid(PetscMalloc1(col_ao_output, &garray_host)); + if (col_ao_output > 0) + { + PetscIntKokkosViewHost garray_h(garray_host, col_ao_output); + // Device to host so don't need to specify exec space + Kokkos::deep_copy(exec, garray_h, garray_d); + Kokkos::fence(); + size_t bytes = col_ao_output * sizeof(PetscInt); + PetscCallVoid(PetscLogGpuToCpu(bytes)); + } // We can create our nonlocal diagonal block matrix directly on the device PetscCallVoid(MatCreateSeqAIJKokkosWithKokkosViews(PETSC_COMM_SELF, local_rows, col_ao_output, i_nonlocal_d, j_nonlocal_d, a_nonlocal_d, &output_mat_nonlocal)); // We can now create our MPI matrix PetscCallVoid(MatCreateMPIAIJWithSeqAIJ(MPI_COMM_MATRIX, global_rows, global_cols, output_mat_local, output_mat_nonlocal, garray_host, output_mat)); - } - // If in serial + } + // If in serial else { *output_mat = output_mat_local; @@ -817,6 +883,7 @@ PETSC_INTERN void remove_small_from_sparse_kokkos(Mat *input_mat, const PetscRea // Drop according to a existing sparsity in output_mat but with kokkos - keeping everything on the device PETSC_INTERN void remove_from_sparse_match_kokkos(Mat *input_mat, Mat *output_mat, const int lump_int, const int alpha_int, const PetscReal alpha) { + //PflareKokkosTrace _trace("remove_from_sparse_match_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows, local_cols, global_rows, global_cols; PetscInt global_row_start_temp, global_row_end_plus_one_temp; @@ -824,6 +891,9 @@ PETSC_INTERN void remove_from_sparse_match_kokkos(Mat *input_mat, Mat *output_ma PetscInt rows_ao_input, cols_ao_input, rows_ao_output, cols_ao_output; MatType mat_type; + // Equivalent to calling MatSeqAIJKokkosSyncDevice which is petsc intern + mat_sync(input_mat); + PetscCallVoid(MatGetType(*input_mat, &mat_type)); // Are we in parallel? const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; @@ -882,6 +952,7 @@ PETSC_INTERN void remove_from_sparse_match_kokkos(Mat *input_mat, Mat *output_ma // ~~~~~~~~~~~~ // Get pointers to the i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i = nullptr, *device_local_j = nullptr, *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; PetscMemType mtype; PetscScalar *device_local_vals = nullptr, *device_nonlocal_vals = nullptr; @@ -1160,6 +1231,7 @@ PETSC_INTERN void remove_from_sparse_match_kokkos(Mat *input_mat, Mat *output_ma // Set all the values of the matrix to val PETSC_INTERN void MatSetAllValues_kokkos(Mat *input_mat, PetscReal val) { + //PflareKokkosTrace _trace("MatSetAllValues_kokkos"); MatType mat_type; PetscCallVoid(MatGetType(*input_mat, &mat_type)); @@ -1186,6 +1258,7 @@ PETSC_INTERN void MatSetAllValues_kokkos(Mat *input_mat, PetscReal val) // ~~~~~~~~~~~~ // Get pointers to the i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i = nullptr, *device_local_j = nullptr, *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; PetscMemType mtype; PetscScalar *device_local_vals = nullptr, *device_nonlocal_vals = nullptr; @@ -1205,6 +1278,7 @@ PETSC_INTERN void MatSetAllValues_kokkos(Mat *input_mat, PetscReal val) Kokkos::deep_copy(exec, a_nonlocal_d, val); PetscCallVoid(PetscLogCpuToGpu(bytes)); } + Kokkos::fence(); // Have to specify we've modifed data on the device // Want to call MatSeqAIJKokkosModifyDevice but its PETSC_INTERN @@ -1232,6 +1306,7 @@ PETSC_INTERN void MatSetAllValues_kokkos(Mat *input_mat, PetscReal val) // Duplicate and copy a matrix ensuring it always has a diagonal but with kokkos - keeping everything on the device PETSC_INTERN void mat_duplicate_copy_plus_diag_kokkos(Mat *input_mat, const int reuse_int, Mat *output_mat) { + //PflareKokkosTrace _trace("mat_duplicate_copy_plus_diag_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt global_row_start_temp, global_row_end_plus_one_temp; PetscInt global_col_start_temp, global_col_end_plus_one_temp; @@ -1242,6 +1317,9 @@ PETSC_INTERN void mat_duplicate_copy_plus_diag_kokkos(Mat *input_mat, const int PetscInt nnzs_match_local, nnzs_match_nonlocal; Mat output_mat_local, output_mat_nonlocal; + // Equivalent to calling MatSeqAIJKokkosSyncDevice which is petsc intern + mat_sync(input_mat); + PetscCallVoid(MatGetType(*input_mat, &mat_type)); // Are we in parallel? const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; @@ -1273,6 +1351,7 @@ PETSC_INTERN void mat_duplicate_copy_plus_diag_kokkos(Mat *input_mat, const int // ~~~~~~~~~~~~ // Get pointers to the i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i = nullptr, *device_local_j = nullptr, *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; PetscMemType mtype; PetscScalar *device_local_vals = nullptr, *device_nonlocal_vals = nullptr; @@ -1508,6 +1587,7 @@ PETSC_INTERN void mat_duplicate_copy_plus_diag_kokkos(Mat *input_mat, const int Kokkos::deep_copy(exec, a_local_d, 0.0); // Annoyingly there isn't currently the ability to get views for i (or j) + Kokkos::fence(); const PetscInt *device_local_i_output = nullptr, *device_local_j_output = nullptr, *device_nonlocal_i_ouput = nullptr; PetscMemType mtype; PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_local_output, &device_local_i_output, &device_local_j_output, NULL, &mtype)); @@ -1596,6 +1676,7 @@ PETSC_INTERN void mat_duplicate_copy_plus_diag_kokkos(Mat *input_mat, const int // Now we have to sort the local column indices, as we add in the identity at the // end of our local j indices KokkosCsrMatrix csrmat_local = KokkosCsrMatrix("csrmat_local", local_rows, local_cols, a_local_d.extent(0), a_local_d, i_local_d, j_local_d); + Kokkos::fence(); KokkosSparse::sort_crs_matrix(csrmat_local); // Let's make sure everything on the device is finished @@ -1637,6 +1718,7 @@ PETSC_INTERN void mat_duplicate_copy_plus_diag_kokkos(Mat *input_mat, const int // Does a MatAXPY for a MPIAIJ Kokkos matrix - the petsc version currently uses the host making it very slow PETSC_INTERN void MatAXPY_kokkos(Mat *Y, PetscScalar alpha, Mat *X) { + //PflareKokkosTrace _trace("MatAXPY_kokkos"); Mat mat_local_y = NULL, mat_nonlocal_y = NULL; Mat mat_local_x = NULL, mat_nonlocal_x = NULL; @@ -1644,33 +1726,10 @@ PETSC_INTERN void MatAXPY_kokkos(Mat *Y, PetscScalar alpha, Mat *X) PetscCallVoid(MatMPIAIJGetSeqAIJ(*Y, &mat_local_y, &mat_nonlocal_y, &colmap_y)); PetscCallVoid(MatMPIAIJGetSeqAIJ(*X, &mat_local_x, &mat_nonlocal_x, &colmap_x)); - Mat_SeqAIJKokkos *mat_local_ykok = static_cast(mat_local_y->spptr); - Mat_SeqAIJKokkos *mat_nonlocal_ykok = static_cast(mat_nonlocal_y->spptr); - Mat_SeqAIJKokkos *mat_local_xkok = static_cast(mat_local_x->spptr); - Mat_SeqAIJKokkos *mat_nonlocal_xkok = static_cast(mat_nonlocal_x->spptr); - // Equivalent to calling MatSeqAIJKokkosSyncDevice which is petsc intern - // We have to make sure the device data is up to date before we do the axpy - if (mat_local_ykok->a_dual.need_sync_device()) { - mat_local_ykok->a_dual.sync_device(); - mat_local_ykok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ - mat_local_ykok->hermitian_updated = PETSC_FALSE; - } - if (mat_nonlocal_ykok->a_dual.need_sync_device()) { - mat_nonlocal_ykok->a_dual.sync_device(); - mat_nonlocal_ykok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ - mat_nonlocal_ykok->hermitian_updated = PETSC_FALSE; - } - if (mat_local_xkok->a_dual.need_sync_device()) { - mat_local_xkok->a_dual.sync_device(); - mat_local_xkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ - mat_local_xkok->hermitian_updated = PETSC_FALSE; - } - if (mat_nonlocal_xkok->a_dual.need_sync_device()) { - mat_nonlocal_xkok->a_dual.sync_device(); - mat_nonlocal_xkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ - mat_nonlocal_xkok->hermitian_updated = PETSC_FALSE; - } + // We have to make sure the device data is up to date before we do the axpy + mat_sync(X); + mat_sync(Y); PetscInt rows_ao_y, cols_ao_y, rows_ao_x, cols_ao_x; auto exec = PetscGetKokkosExecutionSpace(); @@ -1703,6 +1762,7 @@ PETSC_INTERN void MatAXPY_kokkos(Mat *Y, PetscScalar alpha, Mat *X) // ~~~~~~~~~~~~~~~ // Let's go and add the local components together // ~~~~~~~~~~~~~~~ + Kokkos::fence(); Mat_SeqAIJKokkos *xkok_local, *ykok_local; ykok_local = static_cast(mat_local_y->spptr); @@ -1716,8 +1776,8 @@ PETSC_INTERN void MatAXPY_kokkos(Mat *Y, PetscScalar alpha, Mat *X) KernelHandle kh_local; kh_local.create_spadd_handle(true); // X, Y are sorted - KokkosSparse::spadd_symbolic(&kh_local, xkok_local->csrmat, ykok_local->csrmat, zcsr_local); - KokkosSparse::spadd_numeric(&kh_local, alpha, xkok_local->csrmat, (PetscScalar)1.0, ykok_local->csrmat, zcsr_local); + KokkosSparse::spadd_symbolic(exec, &kh_local, xkok_local->csrmat, ykok_local->csrmat, zcsr_local); + KokkosSparse::spadd_numeric(exec, &kh_local, alpha, xkok_local->csrmat, (PetscScalar)1.0, ykok_local->csrmat, zcsr_local); kh_local.destroy_spadd_handle(); @@ -1731,10 +1791,14 @@ PETSC_INTERN void MatAXPY_kokkos(Mat *Y, PetscScalar alpha, Mat *X) i_local_d_copy = Kokkos::View("i_local_d_copy", i_local_d_z.extent(0)); j_local_d_copy = Kokkos::View("j_local_d_copy", j_local_d_z.extent(0)); + // Let's make sure everything on the device is finished + Kokkos::fence(); + Kokkos::deep_copy(exec, a_local_d_copy, a_local_d_z); Kokkos::deep_copy(exec, i_local_d_copy, i_local_d_z); Kokkos::deep_copy(exec, j_local_d_copy, j_local_d_z); } + Kokkos::fence(); // We can create our local diagonal block matrix directly on the device Mat Z_local; @@ -1775,26 +1839,74 @@ PETSC_INTERN void MatAXPY_kokkos(Mat *Y, PetscScalar alpha, Mat *X) // Ensure everything is finished before we hit the spadd below Kokkos::fence(); + // ~~~~~~~~~ + // Build merged garray from the union of X and Y global nonlocal column indices. + // Then remap both to local indices so spadd sees correct column numbering. + // ~~~~~~~~~ + + PetscInt nnz_x = xkok_nonlocal->csrmat.nnz(); + PetscInt nnz_y = ykok_nonlocal->csrmat.nnz(); + PetscInt total_nnz_xy = nnz_x + nnz_y; + + // Non-owning views over the raw j data (already holds global indices at this point) + PetscIntKokkosView j_x_view(device_nonlocal_x_j, nnz_x); + PetscIntKokkosView j_y_view(device_nonlocal_y_j, nnz_y); + + PetscIntKokkosView garray_d; + PetscInt col_ao_output = 0; + + if (total_nnz_xy > 0) + { + // Concatenate all global j indices into one array, sort, unique → merged garray + PetscIntKokkosView combined_j_d("combined_j_d", total_nnz_xy); + Kokkos::deep_copy(exec, Kokkos::subview(combined_j_d, Kokkos::make_pair((PetscInt)0, nnz_x)), j_x_view); + Kokkos::deep_copy(exec, Kokkos::subview(combined_j_d, Kokkos::make_pair(nnz_x, total_nnz_xy)), j_y_view); + Kokkos::sort(exec, combined_j_d); + Kokkos::fence(); + + PetscIntKokkosView garray_full_d("garray_full_d", total_nnz_xy); + auto unique_end_it = Kokkos::Experimental::unique_copy(exec, combined_j_d, garray_full_d); + Kokkos::fence(); + col_ao_output = static_cast(unique_end_it - Kokkos::Experimental::begin(garray_full_d)); + PetscInt zero = 0; + garray_d = Kokkos::subview(garray_full_d, Kokkos::make_pair(zero, col_ao_output)); + + // Remap j_y and j_x from global to local indices into the merged garray + // These fence internally + remap_j_to_local_device(j_y_view, garray_d, col_ao_output); + remap_j_to_local_device(j_x_view, garray_d, col_ao_output); + } + // ~~~~~~~~~ Kokkos::View a_nonlocal_d_copy; Kokkos::View i_nonlocal_d_copy, j_nonlocal_d_copy; PetscInt *garray_host = NULL; - PetscInt col_ao_output = 0; - // Scope so the zcsr_nonlocal is destroyed once we copy + // Scope so the zcsr_nonlocal is destroyed once we copy { - // Now we can add the non-local components together + // Create csrmat wrappers for X and Y with the correct merged numCols + KokkosCsrMatrix xcsrmat("x_nonlocal_remapped", local_rows, col_ao_output, + nnz_x, xkok_nonlocal->csrmat.values, + xkok_nonlocal->csrmat.graph.row_map, xkok_nonlocal->csrmat.graph.entries); + KokkosCsrMatrix ycsrmat("y_nonlocal_remapped", local_rows, col_ao_output, + nnz_y, ykok_nonlocal->csrmat.values, + ykok_nonlocal->csrmat.graph.row_map, ykok_nonlocal->csrmat.graph.entries); + + Kokkos::fence(); + // Now we can add the non-local components together. + // Local indices into the merged sorted garray preserve row-sort order. KokkosCsrMatrix zcsr_nonlocal; - // Global indices are sorted - KernelHandle kh_nonlocal; - kh_nonlocal.create_spadd_handle(true); + KernelHandle kh_nonlocal; + kh_nonlocal.create_spadd_handle(true); - KokkosSparse::spadd_symbolic(&kh_nonlocal, xkok_nonlocal->csrmat, ykok_nonlocal->csrmat, zcsr_nonlocal); - KokkosSparse::spadd_numeric(&kh_nonlocal, alpha, xkok_nonlocal->csrmat, (PetscScalar)1.0, ykok_nonlocal->csrmat, zcsr_nonlocal); + KokkosSparse::spadd_symbolic(exec, &kh_nonlocal, xcsrmat, ycsrmat, zcsr_nonlocal); + KokkosSparse::spadd_numeric(exec, &kh_nonlocal, alpha, xcsrmat, (PetscScalar)1.0, ycsrmat, zcsr_nonlocal); kh_nonlocal.destroy_spadd_handle(); + Kokkos::fence(); + // Can now destroy the copy PetscCallVoid(MatDestroy(&mat_nonlocal_x_copy)); @@ -1804,28 +1916,32 @@ PETSC_INTERN void MatAXPY_kokkos(Mat *Y, PetscScalar alpha, Mat *X) auto i_nonlocal_d_z = zcsr_nonlocal.graph.row_map; auto j_nonlocal_d_z = zcsr_nonlocal.graph.entries; - // We know the most nonlocal indices we can have are the addition of x and y - // (some might be the same) - PetscInt cols_ao = cols_ao_x + cols_ao_y; - - // ~~~~~~~~~ + // j_nonlocal_d_z already contains local indices; copy garray_d to host + PetscCallVoid(PetscMalloc1(col_ao_output, &garray_host)); + if (col_ao_output > 0) + { + PetscIntKokkosViewHost garray_h(garray_host, col_ao_output); + // Device to host so don't need to specify exec space + Kokkos::deep_copy(exec, garray_h, garray_d); + Kokkos::fence(); + size_t bytes = col_ao_output * sizeof(PetscInt); + PetscCallVoid(PetscLogGpuToCpu(bytes)); + } // Let's make sure everything on the device is finished - Kokkos::fence(); - - // Now we need to build garray on the host and rewrite the j_nonlocal_d_z indices so they are local - // This fences internally - rewrite_j_global_to_local(cols_ao, col_ao_output, j_nonlocal_d_z, &garray_host); + Kokkos::fence(); a_nonlocal_d_copy = Kokkos::View("a_local_d_copy", a_nonlocal_d_z.extent(0)); i_nonlocal_d_copy = Kokkos::View("i_local_d_copy", i_nonlocal_d_z.extent(0)); - j_nonlocal_d_copy = Kokkos::View("j_local_d_copy", j_nonlocal_d_z.extent(0)); + j_nonlocal_d_copy = Kokkos::View("j_local_d_copy", j_nonlocal_d_z.extent(0)); Kokkos::deep_copy(exec, a_nonlocal_d_copy, a_nonlocal_d_z); Kokkos::deep_copy(exec, i_nonlocal_d_copy, i_nonlocal_d_z); Kokkos::deep_copy(exec, j_nonlocal_d_copy, j_nonlocal_d_z); } + Kokkos::fence(); + // We can create our nonlocal diagonal block matrix directly on the device Mat Z_nonlocal; PetscCallVoid(MatCreateSeqAIJKokkosWithKokkosViews(PETSC_COMM_SELF, local_rows, col_ao_output, i_nonlocal_d_copy, j_nonlocal_d_copy, a_nonlocal_d_copy, &Z_nonlocal)); @@ -1847,6 +1963,7 @@ PETSC_INTERN void MatAXPY_kokkos(Mat *Y, PetscScalar alpha, Mat *X) // is_col must be sorted PETSC_INTERN void MatCreateSubMatrix_Seq_kokkos(Mat *input_mat, PetscIntKokkosView &is_row_d_d, PetscIntKokkosView &is_col_d_d, const int reuse_int, Mat *output_mat) { + //PflareKokkosTrace _trace("MatCreateSubMatrix_Seq_kokkos"); PetscInt local_rows, local_cols; PetscInt nnzs_match_local; auto exec = PetscGetKokkosExecutionSpace(); @@ -1857,6 +1974,7 @@ PETSC_INTERN void MatCreateSubMatrix_Seq_kokkos(Mat *input_mat, PetscIntKokkosVi // ~~~~~~~~~~~~ // Get pointers to the i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i = nullptr, *device_local_j = nullptr; PetscMemType mtype; PetscScalar *device_local_vals = nullptr; @@ -2063,6 +2181,7 @@ PETSC_INTERN void MatCreateSubMatrix_Seq_kokkos(Mat *input_mat, PetscIntKokkosVi a_local_d = aijkok_local_output->a_dual.view_device(); // Annoyingly there isn't currently the ability to get views for i (or j) + Kokkos::fence(); const PetscInt *device_local_i_output = nullptr; PetscMemType mtype; PetscCallVoid(MatSeqAIJGetCSRAndMemType(*output_mat, &device_local_i_output, NULL, NULL, &mtype)); @@ -2170,6 +2289,7 @@ PETSC_INTERN void MatCreateSubMatrix_Seq_kokkos(Mat *input_mat, PetscIntKokkosVi PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosView &is_row_d_d, PetscInt global_rows_row, \ PetscIntKokkosView &is_col_d_d, PetscInt global_cols_col, const int reuse_int, Mat *output_mat) { + //PflareKokkosTrace _trace("MatCreateSubMatrix_kokkos_view"); PetscInt local_rows, local_cols; PetscInt global_rows, global_cols; PetscInt global_row_start, global_row_end_plus_one; @@ -2232,19 +2352,21 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV // Can't rely on PetscSFBcast with MPIU_INT as that was intermittently breaking // on gpus so want to avoid PetscInt max_encoded_value = global_cols_col > 0 ? global_cols_col - 1 : 0; - PetscCallVoid(check_exact_petscint_to_scalar_encoding(max_encoded_value, MPI_COMM_MATRIX)); + //PetscCallVoid(check_exact_petscint_to_scalar_encoding(max_encoded_value, MPI_COMM_MATRIX)); // Kokkos version of ISGetSeqIS_SameColDist_Private (mpiaij.c) // Uses VecScatter with PetscScalar Vecs (matching PETSc's own pattern) // instead of direct PetscSFBcast with MPIU_INT on temporary views. + //std::cerr << "one " << std::endl; + /* (1) iscol is a sub-column vector of mat, pad it with '-1.' to form a full vector x */ Vec x_vec, cmap_vec; PetscCallVoid(MatCreateVecs(*input_mat, &x_vec, NULL)); PetscCallVoid(VecDuplicate(x_vec, &cmap_vec)); // Fill x_vec on device: x[is_col(i)] = is_col(i), rest = -1 - { + PetscScalarKokkosView x_scalar_d; PetscCallVoid(VecGetKokkosViewWrite(x_vec, &x_scalar_d)); Kokkos::deep_copy(exec, x_scalar_d, -1.0); @@ -2253,17 +2375,27 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV x_scalar_d(is_col_d_d(i)) = (PetscScalar)is_col_d_d(i); }); PetscCallVoid(VecRestoreKokkosViewWrite(x_vec, &x_scalar_d)); - } + + + //std::cerr << "two " << std::endl; /* (2) Scatter x and cmap using Mvctx to get their off-process portions */ // Keep at most one active communication on Mvctx at a time. // While Begin/End is in flight, do not touch the corresponding send/recv buffers. + Vec x_leaf_vec; + PetscCallVoid(VecDuplicate(mat_mpi->lvec, &x_leaf_vec)); // Ensure send/receive buffers are stable before Begin. - Kokkos::fence(); - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, x_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); + Kokkos::fence(); + //std::cerr << "two a " << std::endl; + + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, x_vec, x_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + // x scatter completed: x_leaf_vec is now safe to read. + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, x_vec, x_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + + //std::cerr << "two b" << std::endl; // Fill cmap_vec on device: cmap[is_col(i)] = i + isstart, rest = -1 - { + PetscScalarKokkosView cmap_scalar_d; PetscCallVoid(VecGetKokkosViewWrite(cmap_vec, &cmap_scalar_d)); Kokkos::deep_copy(exec, cmap_scalar_d, -1.0); @@ -2272,7 +2404,8 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV cmap_scalar_d(is_col_d_d(i)) = (PetscScalar)(i + isstart); }); PetscCallVoid(VecRestoreKokkosViewWrite(cmap_vec, &cmap_scalar_d)); - } + + //std::cerr << "three " << std::endl; Vec lcmap_vec; PetscCallVoid(VecDuplicate(mat_mpi->lvec, &lcmap_vec)); @@ -2284,18 +2417,17 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV auto is_col_o_match_d = PetscIntKokkosView("is_col_o_match_d", cols_ao+1); Kokkos::deep_copy(exec, is_col_o_match_d, 0); - // x scatter completed: mat_mpi->lvec is now safe to read. - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, x_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); - // Start cmap scatter only after finishing x scatter on the same Mvctx. // Ensure send/receive buffers are stable before Begin. Kokkos::fence(); PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, cmap_vec, lcmap_vec, INSERT_VALUES, SCATTER_FORWARD)); + // cmap scatter completed: lcmap_vec is now safe to read. + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, cmap_vec, lcmap_vec, INSERT_VALUES, SCATTER_FORWARD)); - if (cols_ao > 0) - { + //if (cols_ao > 0) + //{ ConstPetscScalarKokkosView lvec_scalar_d; - PetscCallVoid(VecGetKokkosView(mat_mpi->lvec, &lvec_scalar_d)); + PetscCallVoid(VecGetKokkosView(x_leaf_vec, &lvec_scalar_d)); Kokkos::parallel_reduce("FindMatches", Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(const PetscInt i, PetscInt& thread_sum) { @@ -2309,8 +2441,11 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV Kokkos::Sum(col_ao_output) ); - PetscCallVoid(VecRestoreKokkosView(mat_mpi->lvec, &lvec_scalar_d)); - } + PetscCallVoid(VecRestoreKokkosView(x_leaf_vec, &lvec_scalar_d)); + //} + + //std::cerr << "four " << std::endl; + // Need to do an exclusive scan on is_col_o_match_d to get the new local indices // Have to remember to go up to cols_ao+1 @@ -2329,11 +2464,8 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV is_col_o_d = PetscIntKokkosView("is_col_o_d", col_ao_output); garray_output_d = PetscIntKokkosView("garray_output_d", col_ao_output); - // cmap scatter completed: lcmap_vec is now safe to read. - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, cmap_vec, lcmap_vec, INSERT_VALUES, SCATTER_FORWARD)); - // Loop over all the cols in the input matrix - { + //{ ConstPetscScalarKokkosView lcmap_scalar_d; PetscCallVoid(VecGetKokkosView(lcmap_vec, &lcmap_scalar_d)); @@ -2352,10 +2484,14 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV Kokkos::fence(); PetscCallVoid(VecRestoreKokkosView(lcmap_vec, &lcmap_scalar_d)); - } + //} + + //std::cerr << "five " << std::endl; + // Cleanup Vecs PetscCallVoid(VecDestroy(&x_vec)); + PetscCallVoid(VecDestroy(&x_leaf_vec)); PetscCallVoid(VecDestroy(&cmap_vec)); PetscCallVoid(VecDestroy(&lcmap_vec)); } @@ -2386,23 +2522,32 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV } // We can now create the off-diagonal component + Kokkos::fence(); MatCreateSubMatrix_Seq_kokkos(&mat_nonlocal, is_row_d_d, is_col_o_d, reuse_int, &output_mat_nonlocal); // If it's our first time through we have to create our output matrix if (!reuse_int) { + //std::cerr << "six " << std::endl; + // Copy the garray output to the host PetscInt *garray_host = NULL; PetscCallVoid(PetscMalloc1(garray_output_d.extent(0), &garray_host)); PetscIntKokkosViewHost colmap_output_h = PetscIntKokkosViewHost(garray_host, garray_output_d.extent(0)); // Copy the garray output to the host - Kokkos::deep_copy(colmap_output_h, garray_output_d); + Kokkos::deep_copy(exec, colmap_output_h, garray_output_d); + Kokkos::fence(); bytes = colmap_output_h.extent(0) * sizeof(PetscInt); PetscCallVoid(PetscLogGpuToCpu(bytes)); + + //std::cerr << "seven " << std::endl; + // We can now create our MPI matrix PetscCallVoid(MatCreateMPIAIJWithSeqAIJ(MPI_COMM_MATRIX, global_rows_row, global_cols_col, output_mat_local, output_mat_nonlocal, garray_host, output_mat)); + //std::cerr << "eight " << std::endl; + // ~~~~~~~~~~~~~~ // If this is the first time through, we need to store the iscol_o in the output_mat // We don't store the is_row_d_d or is_col_d_d like the host version does as they're super cheap to rebuild @@ -2412,7 +2557,8 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV PetscCallVoid(PetscMalloc1(is_col_o_d.extent(0), &is_col_o_host)); PetscIntKokkosViewHost is_col_o_h = PetscIntKokkosViewHost(is_col_o_host, is_col_o_d.extent(0)); // Copy the is_col_o_d output to the host - Kokkos::deep_copy(is_col_o_h, is_col_o_d); + Kokkos::deep_copy(exec, is_col_o_h, is_col_o_d); + Kokkos::fence(); bytes = is_col_o_h.extent(0) * sizeof(PetscInt); PetscCallVoid(PetscLogGpuToCpu(bytes)); // Now create an IS @@ -2422,6 +2568,9 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV PetscCallVoid(PetscObjectCompose((PetscObject)(*output_mat), "iscol_o", (PetscObject)iscol_o)); // The ref counter is incremented by the compose PetscCallVoid(ISDestroy(&iscol_o)); + + //std::cerr << "nine " << std::endl; + } } else @@ -2444,13 +2593,17 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos(Mat *input_mat, IS *is_row, IS *is_c const int reuse_int, Mat *output_mat, \ const int our_level, const int is_row_fine_int, const int is_col_fine_int) { + //PflareKokkosTrace _trace("MatCreateSubMatrix_kokkos"); PetscInt global_row_start, global_row_end_plus_one; PetscInt global_col_start, global_col_end_plus_one; PetscCallVoid(MatGetOwnershipRange(*input_mat, &global_row_start, &global_row_end_plus_one)); PetscCallVoid(MatGetOwnershipRangeColumn(*input_mat, &global_col_start, &global_col_end_plus_one)); PetscInt global_rows_row, global_cols_col; PetscCallVoid(ISGetSize(*is_row, &global_rows_row)); - PetscCallVoid(ISGetSize(*is_col, &global_cols_col)); + PetscCallVoid(ISGetSize(*is_col, &global_cols_col)); + + // Equivalent to calling MatSeqAIJKokkosSyncDevice which is petsc intern + mat_sync(input_mat); PetscIntKokkosView is_row_d_d, is_col_d_d; const int level_idx = our_level - 1; diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx index a7f804a8..861fb734 100644 --- a/src/PMISR_Modulek.kokkos.cxx +++ b/src/PMISR_Modulek.kokkos.cxx @@ -15,6 +15,7 @@ // This mirrors the CPU version pmisr_existing_measure_cf_markers in PMISR_Module.F90 PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, const int max_luby_steps, const int pmis_int, PetscScalarKokkosView &measure_local_d, intKokkosView &cf_markers_d, const int zero_measure_c_point_int) { + //PflareKokkosTrace _trace("pmisr_existing_measure_cf_markers_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows, local_cols, global_rows, global_cols; @@ -50,12 +51,16 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // ~~~~~~~~~~~~ // Get pointers to the i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i = nullptr, *device_local_j = nullptr, *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; PetscMemType mtype; PetscScalar *device_local_vals = nullptr, *device_nonlocal_vals = nullptr; PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_local, &device_local_i, &device_local_j, &device_local_vals, &mtype)); if (mpi) PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_nonlocal, &device_nonlocal_i, &device_nonlocal_j, &device_nonlocal_vals, &mtype)); + + int loops_through = -1; + intKokkosView cf_markers_nonlocal_d; // Scratch buffer used for local update bookkeeping during overlap with reverse scatter. intKokkosView cf_markers_temp_d; @@ -75,8 +80,9 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // Scatter the measure using VecScatter (matching PETSc's own buffer management) if (mpi) { - Vec measure_root_vec; + Vec measure_root_vec, measure_leaf_vec; PetscCallVoid(MatCreateVecs(*strength_mat, &measure_root_vec, NULL)); + PetscCallVoid(VecDuplicate(mat_mpi->lvec, &measure_leaf_vec)); { PetscScalarKokkosView root_scalar_d; PetscCallVoid(VecGetKokkosViewWrite(measure_root_vec, &root_scalar_d)); @@ -84,16 +90,20 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co PetscCallVoid(VecRestoreKokkosViewWrite(measure_root_vec, &root_scalar_d)); } // Ensure send/receive buffers are stable before Begin. - Kokkos::fence(); - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, measure_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, measure_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); + Kokkos::fence(); + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, measure_root_vec, measure_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, measure_root_vec, measure_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); { - ConstPetscScalarKokkosView lvec_scalar_d; - PetscCallVoid(VecGetKokkosView(mat_mpi->lvec, &lvec_scalar_d)); - Kokkos::deep_copy(exec, measure_nonlocal_d, lvec_scalar_d); - PetscCallVoid(VecRestoreKokkosView(mat_mpi->lvec, &lvec_scalar_d)); + ConstPetscScalarKokkosView leaf_scalar_d; + PetscCallVoid(VecGetKokkosView(measure_leaf_vec, &leaf_scalar_d)); + Kokkos::deep_copy(exec, measure_nonlocal_d, leaf_scalar_d); + PetscCallVoid(VecRestoreKokkosView(measure_leaf_vec, &leaf_scalar_d)); } + // Ensure the async deep_copy reading measure_leaf_vec's device memory has completed + // before VecDestroy frees it. + Kokkos::fence(); PetscCallVoid(VecDestroy(&measure_root_vec)); + PetscCallVoid(VecDestroy(&measure_leaf_vec)); } // Initialise the set @@ -162,7 +172,6 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co } // Let's keep track of how many times we go through the loops - int loops_through = -1; do { // Match the fortran version and include a pre-test on the do-while @@ -195,6 +204,8 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // Ensure the root buffer is no longer being written before Begin. Kokkos::fence(); PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + // Complete the in-flight forward scatter before reading the receive buffer. + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); } @@ -261,9 +272,6 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // ~~~~~~~~ if (mpi) { - // Complete the in-flight forward scatter before reading the receive buffer. - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); - // Convert PetscScalar → int after End, when the receive buffer is complete. { ConstPetscScalarKokkosView leaf_scalar_d; @@ -382,6 +390,8 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // Ensure send/receive buffers are stable before Begin. Kokkos::fence(); PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_leaf_vec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); + // Complete reverse scatter before reading reduced root buffer. + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_leaf_vec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); // While reverse scatter is in-flight, do local-only updates in cf_markers_temp_d. // This must not touch scatter_root_vec/scatter_leaf_vec. @@ -395,7 +405,7 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // Check if this node was assigned during this top loop. // We read the temporary buffer here so we do not race with the // reduction into cf_markers_d. - if (cf_markers_temp_d(i) == 2) + if (Kokkos::atomic_load(&cf_markers_temp_d(i)) == 2) { const PetscInt ncols_local = device_local_i[i + 1] - device_local_i[i]; @@ -410,9 +420,6 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co } }); - // Complete reverse scatter before reading reduced root buffer. - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_leaf_vec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); - // Convert PetscScalar → int back to cf_markers_d after End. { ConstPetscScalarKokkosView root_scalar_d; @@ -445,7 +452,7 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // In serial there is no reduction, so we can // update cf_markers_d directly as before. - if (cf_markers_d(i) == loops_through) + if (Kokkos::atomic_load(&cf_markers_d(i)) == loops_through) { const PetscInt ncols_local = device_local_i[i + 1] - device_local_i[i]; @@ -522,6 +529,7 @@ PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, co // See the full comments in the CPU version pmisr_existing_measure_implicit_transpose PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength_mat, const int max_luby_steps, const int pmis_int, PetscScalarKokkosView &measure_local_d, intKokkosView &cf_markers_d, const int zero_measure_c_point_int) { + //PflareKokkosTrace _trace("pmisr_existing_measure_implicit_transpose_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows, local_cols, global_rows, global_cols; @@ -583,6 +591,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength // This returns the global index of the local portion of the matrix PetscCallVoid(MatGetOwnershipRange(*strength_mat, &global_row_start, &global_row_end_plus_one)); + Kokkos::fence(); + // ~~~~~~~~~~~~ // Form the local S+S^T and get CSR pointers // We explicitly compute the local part of S+S^T so we don't have to @@ -603,6 +613,7 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength // ~~~~~~~~~~~~ // Get pointers to the i,j on the device for all the matrices we need // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i_spst = nullptr, *device_local_j_spst = nullptr; const PetscInt *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; const PetscInt *device_nonlocal_i_transpose = nullptr, *device_nonlocal_j_transpose = nullptr; @@ -614,6 +625,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength intKokkosView cf_markers_nonlocal_d; PetscScalarKokkosView measure_nonlocal_d; + Kokkos::fence(); + // ~~~~~~~~~~~~~~~ // veto stores whether a node has been veto'd as a candidate // .NOT. veto(i) means the node can be in the set @@ -634,8 +647,9 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength // directly to PetscSF causes intermittent failures in parallel GPU runs (exact cause unknown). if (mpi) { - Vec measure_root_vec; + Vec measure_root_vec, measure_leaf_vec; PetscCallVoid(MatCreateVecs(*strength_mat, &measure_root_vec, NULL)); + PetscCallVoid(VecDuplicate(mat_mpi->lvec, &measure_leaf_vec)); { PetscScalarKokkosView root_scalar_d; PetscCallVoid(VecGetKokkosViewWrite(measure_root_vec, &root_scalar_d)); @@ -643,16 +657,20 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength PetscCallVoid(VecRestoreKokkosViewWrite(measure_root_vec, &root_scalar_d)); } // Ensure send/receive buffers are stable before Begin. - Kokkos::fence(); - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, measure_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, measure_root_vec, mat_mpi->lvec, INSERT_VALUES, SCATTER_FORWARD)); + Kokkos::fence(); + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, measure_root_vec, measure_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, measure_root_vec, measure_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); { ConstPetscScalarKokkosView lvec_scalar_d; - PetscCallVoid(VecGetKokkosView(mat_mpi->lvec, &lvec_scalar_d)); + PetscCallVoid(VecGetKokkosView(measure_leaf_vec, &lvec_scalar_d)); Kokkos::deep_copy(exec, measure_nonlocal_d, lvec_scalar_d); - PetscCallVoid(VecRestoreKokkosView(mat_mpi->lvec, &lvec_scalar_d)); + PetscCallVoid(VecRestoreKokkosView(measure_leaf_vec, &lvec_scalar_d)); } + // Ensure the async deep_copy reading measure_leaf_vec's device memory has completed + // before VecDestroy frees it. + Kokkos::fence(); PetscCallVoid(VecDestroy(&measure_root_vec)); + PetscCallVoid(VecDestroy(&measure_leaf_vec)); } // ~~~~~~~~~~~~ @@ -694,6 +712,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength } }, counter_in_set_start); + Kokkos::fence(); + // Check the total number of undecided in parallel PetscInt counter_undecided, counter_parallel; if (max_luby_steps < 0) { @@ -722,6 +742,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength PetscCallVoid(VecDuplicate(mat_mpi->lvec, &scatter_leaf_vec)); } + Kokkos::fence(); + // Let's keep track of how many times we go through the loops int loops_through = -1; do @@ -753,7 +775,7 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength PetscCallVoid(VecRestoreKokkosViewWrite(scatter_root_vec, &root_scalar_d)); } // Ensure send/receive buffers are stable before Begin. - Kokkos::fence(); + Kokkos::fence(); PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); // Convert PetscScalar → int @@ -768,6 +790,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength } } + Kokkos::fence(); + // ~~~~~~~~ // Now we use veto to keep track of which candidates can be in the set // Locally we know which ones cannot be in the set due to local strong dependencies (mat_local), @@ -830,6 +854,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength } }); + Kokkos::fence(); + // ~~~~~~~~ // Now let's go through and veto candidates which have strong influences on this rank // ie non-local nodes that influence local nodes through S^T @@ -882,6 +908,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength }); } + Kokkos::fence(); + // Reduce the vetos with a lor via VecScatter ADD_VALUES SCATTER_REVERSE // (LOR is equivalent to sum when values are 0/1 bools) { @@ -903,7 +931,7 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength PetscCallVoid(VecRestoreKokkosViewWrite(scatter_root_vec, &root_scalar_d)); } // Ensure send/receive buffers are stable before Begin. - Kokkos::fence(); + Kokkos::fence(); PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_leaf_vec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_leaf_vec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); { @@ -916,6 +944,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength PetscCallVoid(VecRestoreKokkosView(scatter_root_vec, &root_scalar_d)); } + Kokkos::fence(); + // Now the comms have finished, we know exactly which local nodes on this rank have no // local strong dependencies, influences, non-local influences but not yet non-local dependencies // Let's do the non-local dependencies and then now that the comms are done on veto_local_d @@ -956,6 +986,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength }); } }); + + Kokkos::fence(); } // This cf_markers_d(i) = loops_through happens above in the case of mpi, saves a kernel launch else @@ -967,6 +999,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength if (!veto_local_d(i)) cf_markers_d(i) = loops_through; }); + + Kokkos::fence(); } // ~~~~~~~~~~~~~ @@ -989,7 +1023,7 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength const PetscInt i = t.league_rank(); // Check if this node has been assigned during this top loop - if (cf_markers_d(i) == loops_through) + if (Kokkos::atomic_load(&cf_markers_d(i)) == loops_through) { // Do the strong dependencies and influences PetscInt ncols_local = device_local_i_spst[i + 1] - device_local_i_spst[i]; @@ -1001,7 +1035,7 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength // Skip the diagonal - we don't want to mark ourselves as a neighbor // Needs to be atomic as may being set by many threads - if (cf_markers_d(col) != 1 && col != i) + if (Kokkos::atomic_load(&cf_markers_d(col)) != 1 && col != i) { Kokkos::atomic_store(&cf_markers_d(col), 1); } @@ -1009,35 +1043,14 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength } }); - // Now for the influences, we need to broadcast the cf_markers so that - // on other ranks we know which nodes have cf_markers_nonlocal_d(i) == loops_through - { - PetscScalarKokkosView root_scalar_d; - PetscCallVoid(VecGetKokkosViewWrite(scatter_root_vec, &root_scalar_d)); - Kokkos::parallel_for( - Kokkos::RangePolicy<>(exec, 0, local_rows), KOKKOS_LAMBDA(PetscInt i) { - root_scalar_d(i) = (PetscScalar)cf_markers_d(i); - }); - PetscCallVoid(VecRestoreKokkosViewWrite(scatter_root_vec, &root_scalar_d)); - } - // Ensure send/receive buffers are stable before Begin. - Kokkos::fence(); - PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); - PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); - { - ConstPetscScalarKokkosView leaf_scalar_d; - PetscCallVoid(VecGetKokkosView(scatter_leaf_vec, &leaf_scalar_d)); - Kokkos::parallel_for( - Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(PetscInt i) { - cf_markers_nonlocal_d(i) = (int)leaf_scalar_d(i); - }); - PetscCallVoid(VecRestoreKokkosView(scatter_leaf_vec, &leaf_scalar_d)); - } + Kokkos::fence(); // We use the veto arrays here to do this comms Kokkos::deep_copy(exec, veto_nonlocal_d, false); Kokkos::deep_copy(exec, veto_local_d, false); + Kokkos::fence(); + // Set non-local strong dependencies Kokkos::parallel_for( Kokkos::TeamPolicy<>(exec, local_rows, Kokkos::AUTO()), @@ -1062,6 +1075,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength } }); + Kokkos::fence(); + // Reduce the veto_nonlocal_d with a lor via VecScatter ADD_VALUES SCATTER_REVERSE // Any local node with veto set to true is not in the set { @@ -1083,7 +1098,7 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength PetscCallVoid(VecRestoreKokkosViewWrite(scatter_root_vec, &root_scalar_d)); } // Ensure send/receive buffers are stable before Begin. - Kokkos::fence(); + Kokkos::fence(); PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_leaf_vec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_leaf_vec, scatter_root_vec, ADD_VALUES, SCATTER_REVERSE)); { @@ -1096,6 +1111,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength PetscCallVoid(VecRestoreKokkosView(scatter_root_vec, &root_scalar_d)); } + Kokkos::fence(); + // Let's finish the non-local dependencies // If this node has been veto'd, then set it to not in the set Kokkos::parallel_for( @@ -1105,6 +1122,37 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength } }); + Kokkos::fence(); + + // Now that non-local dependencies are marked, broadcast the cf_markers so that + // on other ranks we know which nodes have cf_markers_nonlocal_d(i) == loops_through + // (i.e. which nonlocal nodes were assigned to the IS this iteration). + // This matches the Fortran ordering: reverse scatter first, then forward scatter. + { + PetscScalarKokkosView root_scalar_d; + PetscCallVoid(VecGetKokkosViewWrite(scatter_root_vec, &root_scalar_d)); + Kokkos::parallel_for( + Kokkos::RangePolicy<>(exec, 0, local_rows), KOKKOS_LAMBDA(PetscInt i) { + root_scalar_d(i) = (PetscScalar)cf_markers_d(i); + }); + PetscCallVoid(VecRestoreKokkosViewWrite(scatter_root_vec, &root_scalar_d)); + } + // Ensure send/receive buffers are stable before Begin. + Kokkos::fence(); + PetscCallVoid(VecScatterBegin(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + PetscCallVoid(VecScatterEnd(mat_mpi->Mvctx, scatter_root_vec, scatter_leaf_vec, INSERT_VALUES, SCATTER_FORWARD)); + { + ConstPetscScalarKokkosView leaf_scalar_d; + PetscCallVoid(VecGetKokkosView(scatter_leaf_vec, &leaf_scalar_d)); + Kokkos::parallel_for( + Kokkos::RangePolicy<>(exec, 0, cols_ao), KOKKOS_LAMBDA(PetscInt i) { + cf_markers_nonlocal_d(i) = (int)leaf_scalar_d(i); + }); + PetscCallVoid(VecRestoreKokkosView(scatter_leaf_vec, &leaf_scalar_d)); + } + + Kokkos::fence(); + // And now we have the information we need to set any of the non-local influences if (mat_nonlocal_transpose != NULL) { @@ -1125,13 +1173,14 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength Kokkos::TeamThreadRange(t, ncols_nonlocal), [&](const PetscInt j) { // Needs to be atomic as may being set by many threads - if (cf_markers_d(device_nonlocal_j_transpose[device_nonlocal_i_transpose[i] + j]) != 1) + if (Kokkos::atomic_load(&cf_markers_d(device_nonlocal_j_transpose[device_nonlocal_i_transpose[i] + j])) != 1) { Kokkos::atomic_store(&cf_markers_d(device_nonlocal_j_transpose[device_nonlocal_i_transpose[i] + j]), 1); } }); } }); + Kokkos::fence(); } } else @@ -1146,7 +1195,7 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength const PetscInt i = t.league_rank(); // Check if this node has been assigned during this top loop - if (cf_markers_d(i) == loops_through) + if (Kokkos::atomic_load(&cf_markers_d(i)) == loops_through) { // Do the strong dependencies and influences PetscInt ncols_local = device_local_i_spst[i + 1] - device_local_i_spst[i]; @@ -1158,13 +1207,14 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength // Skip the diagonal - we don't want to mark ourselves as a neighbor // Needs to be atomic as may being set by many threads - if (cf_markers_d(col) != 1 && col != i) + if (Kokkos::atomic_load(&cf_markers_d(col)) != 1 && col != i) { Kokkos::atomic_store(&cf_markers_d(col), 1); } }); } }); + Kokkos::fence(); } // We've done another top level loop @@ -1189,6 +1239,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength Kokkos::fence(); } + Kokkos::fence(); + } while (counter_undecided != 0); @@ -1200,6 +1252,8 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength if (destroy_spst) PetscCallVoid(MatDestroy(&mat_local_spst)); if (destroy_nonlocal_transpose) PetscCallVoid(MatDestroy(&mat_nonlocal_transpose)); + Kokkos::fence(); + // ~~~~~~~~~ // Now assign our final cf markers // ~~~~~~~~~ @@ -1233,6 +1287,7 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength // You have to explicitly call copy_cf_markers_d2h(cf_markers_local) to do this PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, const int pmis_int, PetscReal *measure_local, const int zero_measure_c_point_int) { + //PflareKokkosTrace _trace("pmisr_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows, local_cols, global_rows, global_cols; @@ -1263,6 +1318,7 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons // ~~~~~~~~~~~~ // Get pointers to the i,j,vals on the device // ~~~~~~~~~~~~ + Kokkos::fence(); const PetscInt *device_local_i = nullptr, *device_local_j = nullptr, *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; PetscMemType mtype; PetscScalar *device_local_vals = nullptr, *device_nonlocal_vals = nullptr; @@ -1281,6 +1337,8 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons auto exec = PetscGetKokkosExecutionSpace(); + Kokkos::fence(); + // If you want to generate the randoms on the device //Kokkos::Random_XorShift64_Pool<> random_pool(/*seed=*/12345); // Copy the input measure from host to device @@ -1289,6 +1347,8 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons size_t bytes = measure_local_h.extent(0) * sizeof(PetscReal); PetscCallVoid(PetscLogCpuToGpu(bytes)); + Kokkos::fence(); + // Compute the measure Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, local_rows), KOKKOS_LAMBDA(PetscInt i) { @@ -1309,10 +1369,14 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons // Flip the sign if pmis if (pmis_int == 1) measure_local_d(i) *= -1; }); + Kokkos::fence(); // Call the existing measure cf markers function pmisr_existing_measure_cf_markers_kokkos(strength_mat, max_luby_steps, pmis_int, measure_local_d, cf_markers_d, zero_measure_c_point_int); + // Sanity check: every local point must be marked F(-1) or C(1) + //check_cf_markers_all_marked_kokkos(cf_markers_d, local_rows, MPI_COMM_MATRIX); + // If PMIS then we swap the CF markers from PMISR if (pmis_int) { Kokkos::parallel_for( diff --git a/src/SAI_Zk.kokkos.cxx b/src/SAI_Zk.kokkos.cxx index 7334b886..d9da1994 100644 --- a/src/SAI_Zk.kokkos.cxx +++ b/src/SAI_Zk.kokkos.cxx @@ -13,6 +13,7 @@ PETSC_INTERN void calculate_and_build_sai_z_kokkos(Mat *A_ff, Mat *A_cf, Mat *sparsity_mat_cf, const int reuse_int_reuse_mat, Mat *reuse_mat, Mat *z_mat) { + //PflareKokkosTrace _trace("calculate_and_build_sai_z_kokkos"); MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows_cf, local_cols_cf; PetscInt local_rows_ff, local_cols_ff; @@ -24,6 +25,9 @@ PETSC_INTERN void calculate_and_build_sai_z_kokkos(Mat *A_ff, Mat *A_cf, Mat *sp PetscInt one = 1; bool deallocate_submatrices = false; + mat_sync(A_ff); + mat_sync(A_cf); + PetscCallVoid(MatGetType(*A_ff, &mat_type)); // Are we in parallel? const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; @@ -207,6 +211,7 @@ PETSC_INTERN void calculate_and_build_sai_z_kokkos(Mat *A_ff, Mat *A_cf, Mat *sp // ~~~~~~~~~~~~~~ // Get device CSR pointers for all matrices // ~~~~~~~~~~~~~~ + Kokkos::fence(); PetscMemType mtype; // Submatrix (non-local rows of A_ff) diff --git a/src/VecISCopyLocalk.kokkos.cxx b/src/VecISCopyLocalk.kokkos.cxx index f5b1ff74..32bfa24a 100644 --- a/src/VecISCopyLocalk.kokkos.cxx +++ b/src/VecISCopyLocalk.kokkos.cxx @@ -13,6 +13,7 @@ int max_levels = -1; // Destroys the data PETSC_INTERN void destroy_VecISCopyLocal_kokkos() { + //PflareKokkosTrace _trace("destroy_VecISCopyLocal_kokkos"); if (IS_fine_views_local) { // Will automatically call the destructor on each element delete[] IS_fine_views_local; @@ -31,6 +32,7 @@ PETSC_INTERN void destroy_VecISCopyLocal_kokkos() // Creates the data we need to do the equivalent of veciscopy on local data in kokkos PETSC_INTERN void create_VecISCopyLocal_kokkos(int max_levels_input) { + //PflareKokkosTrace _trace("create_VecISCopyLocal_kokkos"); // If not built if (!IS_fine_views_local) { @@ -65,6 +67,7 @@ PETSC_INTERN void create_VecISCopyLocal_kokkos(int max_levels_input) // Copy the input IS's to the device for our_level PETSC_INTERN void set_VecISCopyLocal_kokkos_our_level(int our_level, PetscInt global_row_start, IS *index_fine, IS *index_coarse) { + //PflareKokkosTrace _trace("set_VecISCopyLocal_kokkos_our_level"); auto exec = PetscGetKokkosExecutionSpace(); const int level_idx = our_level - 1; @@ -83,6 +86,9 @@ PETSC_INTERN void set_VecISCopyLocal_kokkos_our_level(int our_level, PetscInt gl IS_fine_views_local[level_idx] = std::make_shared("IS_fine_view_" + std::to_string(our_level), fine_local_size); // Copy the indices over to the device Kokkos::deep_copy(exec, *IS_fine_views_local[level_idx], fine_view_h); + // The source pointer is owned by ISGetIndices; make sure copy completed + // before restoring that host buffer. + Kokkos::fence(); // Log copy with petsc size_t bytes = fine_view_h.extent(0) * sizeof(PetscInt); PetscCallVoid(PetscLogCpuToGpu(bytes)); @@ -102,6 +108,9 @@ PETSC_INTERN void set_VecISCopyLocal_kokkos_our_level(int our_level, PetscInt gl IS_coarse_views_local[level_idx] = std::make_shared("IS_coarse_view_" + std::to_string(our_level), coarse_local_size); // Copy the indices over to the device Kokkos::deep_copy(exec, *IS_coarse_views_local[level_idx], coarse_view_h); + // The source pointer is owned by ISGetIndices; make sure copy completed + // before restoring that host buffer. + Kokkos::fence(); // Log copy with petsc bytes = coarse_view_h.extent(0) * sizeof(PetscInt); PetscCallVoid(PetscLogCpuToGpu(bytes)); @@ -124,6 +133,8 @@ PETSC_INTERN void set_VecISCopyLocal_kokkos_our_level(int our_level, PetscInt gl // Do the equivalent of veciscopy on local data using the IS data on the device PETSC_INTERN void VecISCopyLocal_kokkos(int our_level, int fine_int, Vec *vfull, int mode_int, Vec *vreduced) { + //PflareKokkosTrace _trace("VecISCopyLocal_kokkos"); + Kokkos::fence(); const int level_idx = our_level - 1; // Can't use the shared pointer directly within the parallel