Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
89d9e95
Add a fence before every MatSeqAIJGetCSRAndMemType call
stevendargaville Mar 31, 2026
bd7cb89
Ensure fences are called before MatCreateSeqAIJKokkosWithKokkosViews
stevendargaville Mar 31, 2026
cc6e947
Fence the sorting more carefully
stevendargaville Apr 1, 2026
38390e4
Add fence after unique_copy
stevendargaville Apr 1, 2026
edf2fd8
Fence before off-diagonal MatCreateSubMatrix_Seq_kokkos call
stevendargaville Apr 1, 2026
90d3ed9
Make scatters synchronous
stevendargaville Apr 1, 2026
8e6e36b
Add fence before ISCopyLocal
stevendargaville Apr 1, 2026
2d3ed11
Ensure sort_crs_matrix and spadd in Kokkos are only performed on matr…
stevendargaville Apr 2, 2026
93bc84c
Extra fence before spadd
stevendargaville Apr 2, 2026
91d436e
Only use lvec as leaf vector in scatters
stevendargaville Apr 3, 2026
3ead627
Sync every matrix before use in Kokkos
stevendargaville Apr 3, 2026
300bebb
Add tracers
stevendargaville Apr 3, 2026
a822ad5
Write out to stderr
stevendargaville Apr 3, 2026
6bcd65b
Add validation checks in pmisr
stevendargaville Apr 3, 2026
da621a6
Add more printing inside pmisr
stevendargaville Apr 3, 2026
1f97f07
remove extra fence in print
stevendargaville Apr 3, 2026
3b50ce9
Extra debugging
stevendargaville Apr 3, 2026
3d7766e
Extra writes
stevendargaville Apr 3, 2026
aee4f21
Fence between start/end
stevendargaville Apr 3, 2026
333019c
extra fence
stevendargaville Apr 3, 2026
9466d26
print sf details
stevendargaville Apr 3, 2026
8b36f17
print out lvec kokkos data pointer
stevendargaville Apr 3, 2026
b262c53
Replace lvec use
stevendargaville Apr 4, 2026
2156999
Add exec and fence to d2h copies
stevendargaville Apr 4, 2026
51fa939
Add more print staetments
stevendargaville Apr 4, 2026
b845b5b
Add arnoldi prints
stevendargaville Apr 6, 2026
f86e5fe
Ensure reads are atomic
stevendargaville Apr 6, 2026
9d6b754
Move scatter
stevendargaville Apr 6, 2026
d557cc6
Bounds checking on cf splitting
stevendargaville Apr 6, 2026
8a942f2
Ensure fences before vecdestroy
stevendargaville Apr 7, 2026
8685b24
Remove scoping
stevendargaville Apr 7, 2026
b6ccbe0
More text output
stevendargaville Apr 7, 2026
32dffcb
Extra scatter print
stevendargaville Apr 7, 2026
67f48f5
Extract acf afc print
stevendargaville Apr 7, 2026
6d4ea50
Replace acf matcreatesubmatrix
stevendargaville Apr 7, 2026
cd44845
Disable kokkos MatCreateSubMatrix
stevendargaville Apr 7, 2026
21dbc46
Don't use device copies in kokkos submatrix
stevendargaville Apr 7, 2026
c3c4da7
Add more fences around restoreindices
stevendargaville Apr 8, 2026
6f07ad6
Hypothesis testing
stevendargaville Apr 8, 2026
fd33a7a
More bounds checking
stevendargaville Apr 8, 2026
76ed5c3
Disable local seq create matrix in kokkos
stevendargaville Apr 8, 2026
944d18b
Replacement
stevendargaville Apr 8, 2026
e66eb87
Test cpu only in kokkos
stevendargaville Apr 8, 2026
afa76fd
Extra assert
stevendargaville Apr 8, 2026
dd941a6
Reenable
stevendargaville Apr 8, 2026
5f0e04b
C version of petsc cpu
stevendargaville Apr 9, 2026
3564898
IS check
stevendargaville Apr 9, 2026
81fa70b
Move cpu call
stevendargaville Apr 9, 2026
fec7243
Change location
stevendargaville Apr 9, 2026
84c292e
Remove extra
stevendargaville Apr 9, 2026
2170ed3
Comment out useless
stevendargaville Apr 9, 2026
0e34228
Check everything disabled
stevendargaville Apr 9, 2026
f102988
Pass by copy
stevendargaville Apr 9, 2026
13e1fd7
Remove kokkos code
stevendargaville Apr 9, 2026
4a27531
Allow more of the kokkos 1
stevendargaville Apr 10, 2026
7e86863
Keep our_level minus one path
stevendargaville Apr 10, 2026
5e4c812
More fences around cf splitting
stevendargaville Apr 10, 2026
7629903
Disable prints
stevendargaville Apr 10, 2026
367c465
Disable more printing
stevendargaville Apr 10, 2026
e4607f5
Reenable gpu kokkos createsubmatrix
stevendargaville Apr 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 118 additions & 0 deletions include/kokkos_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,23 @@
#include <KokkosSparse_spadd.hpp>
#include <Kokkos_NestedSort.hpp>
#include <KokkosBatched_Gesv.hpp>
#include <cstdio>

struct PflareKokkosTrace {
const char *name;
PflareKokkosTrace(const char *n) : name(n) {
int rank = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
fprintf(stderr, "[PFLARE kokkos rank=%d] Entering %s\n", rank, name);
fflush(stderr);
}
~PflareKokkosTrace() {
int rank = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
fprintf(stderr, "[PFLARE kokkos rank=%d] Leaving %s\n", rank, name);
fflush(stderr);
}
};

using DefaultExecutionSpace = Kokkos::DefaultExecutionSpace;
using DefaultMemorySpace = Kokkos::DefaultExecutionSpace::memory_space;
Expand All @@ -34,6 +51,7 @@ using Scratch2DScalarView = Kokkos::View<PetscScalar**, ScratchSpace, Kokkos::Me
using ViewPetscIntPtr = std::shared_ptr<PetscIntKokkosView>;

PETSC_INTERN void mat_duplicate_copy_plus_diag_kokkos(Mat *, int, Mat *);
PETSC_INTERN void mat_sync(Mat *);
PETSC_INTERN void rewrite_j_global_to_local(PetscInt, PetscInt&, PetscIntKokkosView, PetscInt**);
PETSC_INTERN void create_cf_is_device_kokkos(Mat *input_mat, const int match_cf, PetscIntKokkosView &is_local_d);
PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, const int max_luby_steps, const int pmis_int, PetscScalarKokkosView &measure_local_d, intKokkosView &cf_markers_d, const int zero_measure_c_point_int);
Expand Down Expand Up @@ -146,4 +164,104 @@ PetscInt binary_search_sorted(const ViewType &sorted_view, const PetscInt size,
return -1;
}

// Check that every entry in cf_markers_d is either -1 (F) or 1 (C).
// Calls MPI_Abort if any local point is not marked.
inline void check_cf_markers_all_marked_kokkos(
const intKokkosView &cf_markers_d,
const PetscInt local_rows,
MPI_Comm MPI_COMM_MATRIX)
{
auto exec = PetscGetKokkosExecutionSpace();
PetscInt bad_count = 0;
Kokkos::parallel_reduce(
"check_cf_markers",
Kokkos::RangePolicy<>(exec, 0, local_rows),
KOKKOS_LAMBDA(const PetscInt i, PetscInt &count) {
if (cf_markers_d(i) != -1 && cf_markers_d(i) != 1) count++;
}, bad_count);
Kokkos::fence();
int rank = 0;
MPI_Comm_rank(MPI_COMM_MATRIX, &rank);
if (bad_count > 0) {
fprintf(stderr,
"[PFLARE kokkos rank=%d] ERROR check_cf_markers_all_marked_kokkos: "
"%d / %d local points are NOT marked F or C\n",
rank, (int)bad_count, (int)local_rows);
fflush(stderr);
MPI_Abort(MPI_COMM_MATRIX, 1);
}
// else {
// fprintf(stderr,
// "[PFLARE kokkos rank=%d] check_cf_markers_all_marked_kokkos: "
// "all %d local points marked F or C OK\n",
// rank, (int)local_rows);
// fflush(stderr);
// }
}

// Check that is_fine_local_d and is_coarse_local_d together cover every local
// point [0, local_rows-1] exactly once (no missing, no duplicates).
// Call before global-index conversion (entries are local offsets [0, local_rows-1]).
// Calls MPI_Abort if any point is missing or duplicated.
inline void check_cf_is_all_local_kokkos(
const PetscIntKokkosView &is_fine_local_d,
const PetscIntKokkosView &is_coarse_local_d,
const PetscInt local_rows,
MPI_Comm MPI_COMM_MATRIX)
{
auto exec = PetscGetKokkosExecutionSpace();
int rank = 0;
MPI_Comm_rank(MPI_COMM_MATRIX, &rank);

// Allocate hit-count array, initialised to 0
intKokkosView hit_count("hit_count", local_rows);
Kokkos::deep_copy(exec, hit_count, 0);

// Mark each fine index (atomic to catch duplicates within the fine set)
Kokkos::parallel_for(
"check_cf_is_mark_fine",
Kokkos::RangePolicy<>(exec, 0, (PetscInt)is_fine_local_d.extent(0)),
KOKKOS_LAMBDA(const PetscInt i) {
const PetscInt idx = is_fine_local_d(i);
if (idx >= 0 && idx < local_rows)
Kokkos::atomic_add(&hit_count(idx), 1);
});

// Mark each coarse index
Kokkos::parallel_for(
"check_cf_is_mark_coarse",
Kokkos::RangePolicy<>(exec, 0, (PetscInt)is_coarse_local_d.extent(0)),
KOKKOS_LAMBDA(const PetscInt i) {
const PetscInt idx = is_coarse_local_d(i);
if (idx >= 0 && idx < local_rows)
Kokkos::atomic_add(&hit_count(idx), 1);
});

// Count any point not hit exactly once
PetscInt bad_count = 0;
Kokkos::parallel_reduce(
"check_cf_is_count_bad",
Kokkos::RangePolicy<>(exec, 0, local_rows),
KOKKOS_LAMBDA(const PetscInt i, PetscInt &count) {
if (hit_count(i) != 1) count++;
}, bad_count);

Kokkos::fence();

if (bad_count > 0) {
fprintf(stderr,
"[PFLARE kokkos rank=%d] ERROR check_cf_is_all_local_kokkos: "
"%d / %d local points are not covered exactly once by fine+coarse IS\n",
rank, (int)bad_count, (int)local_rows);
fflush(stderr);
MPI_Abort(MPI_COMM_MATRIX, 1);
} else {
fprintf(stderr,
"[PFLARE kokkos rank=%d] check_cf_is_all_local_kokkos: "
"fine=%d coarse=%d, all %d local points covered exactly once OK\n",
rank, (int)is_fine_local_d.extent(0), (int)is_coarse_local_d.extent(0), (int)local_rows);
fflush(stderr);
}
}

#endif
14 changes: 13 additions & 1 deletion src/AIR_MG_Setup.F90
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,9 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input)
! We already know how many coarse levels we have if we are re-using
if (.NOT. air_data%allocated_matrices_A_ff(our_level) .AND. &
our_level .ge. air_data%options%auto_truncate_start_level .AND. &
air_data%options%auto_truncate_start_level /= -1) then
air_data%options%auto_truncate_start_level /= -1) then

!print *, "starting auto truncate check on level ", our_level

call timer_start(TIMER_ID_AIR_TRUNCATE)

Expand All @@ -168,6 +170,8 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input)
proc_stride, &
air_data%inv_coarsest_poly_data)

!print *, "starting approx inverse ", our_level

! Start the approximate inverse we'll use on this level
call start_approximate_inverse(air_data%coarse_matrix(our_level), &
air_data%inv_coarsest_poly_data%inverse_type, &
Expand All @@ -189,6 +193,8 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input)
call VecDuplicate(rand_vec, sol_vec, ierr)
call VecDuplicate(rand_vec, temp_vec, ierr)

!print *, "starting finish approx inverse ", our_level

! Finish our approximate inverse
call finish_approximate_inverse(air_data%coarse_matrix(our_level), &
air_data%inv_coarsest_poly_data%inverse_type, &
Expand All @@ -209,6 +215,8 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input)
air_data%inv_coarsest_poly_data%inverse_type == PFLAREINV_NEWTON_NO_EXTRA) .AND. &
air_data%options%coarsest_matrix_free_polys) then

!print *, "starting matvecs residual ", our_level

if (air_data%options%coarsest_diag_scale_polys) then
call petsc_matvec_right_scale_poly_newton_residual_mf(air_data%inv_A_ff(our_level), rand_vec, temp_vec)
else
Expand All @@ -225,6 +233,8 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input)
call VecAXPY(temp_vec, -1d0, rand_vec, ierr)
end if

!print *, "computing norms ", our_level

! Get the achieved norm
call VecNorm(temp_vec, NORM_2, achieved_rel_tol, ierr)

Expand Down Expand Up @@ -253,6 +263,8 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input)
call timer_finish(TIMER_ID_AIR_TRUNCATE)
end if

!print *, "starting cf splitting ", our_level

! ~~~~~~~~~~~~
! Compute the coarsening
! ~~~~~~~~~~~~
Expand Down
36 changes: 27 additions & 9 deletions src/AIR_Operators_Setup.F90
Original file line number Diff line number Diff line change
Expand Up @@ -191,30 +191,48 @@ subroutine get_submatrices_start_poly_coeff_comms(input_mat, our_level, air_data
! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
! Pull out the rest of the sub-matrices
! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
call timer_start(TIMER_ID_AIR_EXTRACT)
call timer_start(TIMER_ID_AIR_EXTRACT)

!print *, "extract afc acf start"

! Only reuse when coarse matrix structure is stable (amount>=2 stores MAT_RAP_DROP)
if (air_data%allocated_matrices_A_ff(our_level) .AND. &
air_data%options%reuse_sparsity .AND. &
REUSE_MAT_ACTIVE(MAT_RAP_DROP, air_data%options%reuse_amount)) then
call MatCreateSubMatrixWrapper(input_mat, &
air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_REUSE_MATRIX, &
air_data%A_fc(our_level), &
our_level = our_level, is_row_fine = .TRUE., is_col_fine = .FALSE.)
air_data%A_fc(our_level))
call MatCreateSubMatrixWrapper(input_mat, &
air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_REUSE_MATRIX, &
air_data%A_cf(our_level), &
our_level = our_level, is_row_fine = .FALSE., is_col_fine = .TRUE.)
air_data%A_cf(our_level))
else
call MatCreateSubMatrixWrapper(input_mat, &
air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_INITIAL_MATRIX, &
air_data%A_fc(our_level), &
our_level = our_level, is_row_fine = .TRUE., is_col_fine = .FALSE.)
air_data%A_fc(our_level))
call MatCreateSubMatrixWrapper(input_mat, &
air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_INITIAL_MATRIX, &
air_data%A_cf(our_level), &
our_level = our_level, is_row_fine = .FALSE., is_col_fine = .TRUE.)
air_data%A_cf(our_level))
end if
! Only reuse when coarse matrix structure is stable (amount>=2 stores MAT_RAP_DROP)
! if (air_data%allocated_matrices_A_ff(our_level) .AND. &
! air_data%options%reuse_sparsity .AND. &
! REUSE_MAT_ACTIVE(MAT_RAP_DROP, air_data%options%reuse_amount)) then
! call MatCreateSubMatrix(input_mat, &
! air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_REUSE_MATRIX, &
! air_data%A_fc(our_level), ierr)
! call MatCreateSubMatrix(input_mat, &
! air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_REUSE_MATRIX, &
! air_data%A_cf(our_level),ierr)
! else
! call MatCreateSubMatrix(input_mat, &
! air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_INITIAL_MATRIX, &
! air_data%A_fc(our_level), ierr)
! call MatCreateSubMatrix(input_mat, &
! air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_INITIAL_MATRIX, &
! air_data%A_cf(our_level), ierr)
! end if

!print *, "extract afc acf done"

call timer_finish(TIMER_ID_AIR_EXTRACT)

Expand Down
19 changes: 18 additions & 1 deletion src/DDC_Modulek.kokkos.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,16 @@
// You have to explicitly call copy_cf_markers_d2h(cf_markers_local) to do this
PETSC_INTERN void ddc_kokkos(Mat *input_mat, const PetscReal fraction_swap, const PetscReal max_dd_ratio, const PetscReal max_dd_ratio_achieved, Mat *aff, PetscReal *random_numbers)
{
// Can't use the global directly within the parallel
//PflareKokkosTrace _trace("ddc_kokkos");
// Can't use the global directly within the parallel
// regions on the device
intKokkosView cf_markers_d = cf_markers_local_d;
PetscScalarKokkosView diag_dom_ratio_d = diag_dom_ratio_local_d;
PetscIntKokkosView is_fine_local_d;
// Equivalent to calling MatSeqAIJKokkosSyncDevice which is petsc intern
mat_sync(input_mat);
MPI_Comm MPI_COMM_MATRIX;
PetscCallVoid(PetscObjectGetComm((PetscObject)*input_mat, &MPI_COMM_MATRIX));

const int match_cf = -1; // F_POINT == -1
create_cf_is_device_kokkos(input_mat, match_cf, is_fine_local_d);
Expand Down Expand Up @@ -64,11 +69,15 @@ PETSC_INTERN void ddc_kokkos(Mat *input_mat, const PetscReal fraction_swap, cons
// recompute
// ~~~~~~~~~~~~~~~
{
Kokkos::fence();

// Create measure and cf_markers for Aff
PetscScalarKokkosView measure_d("measure_d", local_rows_aff);
intKokkosView cf_markers_aff_d("cf_markers_aff_d", local_rows_aff);
Kokkos::deep_copy(exec, cf_markers_aff_d, 0);

Kokkos::fence();

// Copy the random numbers from host to device
// These are generated in the Fortran wrapper so CPU and Kokkos use the same randoms
PetscScalarKokkosViewHost random_h(random_numbers, local_rows_aff);
Expand All @@ -79,6 +88,8 @@ PETSC_INTERN void ddc_kokkos(Mat *input_mat, const PetscReal fraction_swap, cons
const PetscReal max_scale = std::max(10.0, max_dd_ratio_achieved * 2.0);
const PetscReal target_ratio = max_dd_ratio;

Kokkos::fence();

// Build the measure:
// pmisr_existing_measure_cf_markers tags the smallest measure as F points
// So we feed in measure = max(10, max_achieved*2) - (diag_dom_ratio - random/1e10)
Expand All @@ -103,6 +114,10 @@ PETSC_INTERN void ddc_kokkos(Mat *input_mat, const PetscReal fraction_swap, cons
// pmis_int=0 means PMISR, zero_measure_c_point_int=0
pmisr_existing_measure_implicit_transpose_kokkos(aff, -1, 0, measure_d, cf_markers_aff_d, 0);

//check_cf_markers_all_marked_kokkos(cf_markers_aff_d, cf_markers_aff_d.extent(0), MPI_COMM_MATRIX);

Kokkos::fence();

// Swap F-tagged points back into cf_markers_d
Kokkos::parallel_for(
Kokkos::RangePolicy<>(exec, 0, local_rows_aff), KOKKOS_LAMBDA(PetscInt i) {
Expand All @@ -112,6 +127,8 @@ PETSC_INTERN void ddc_kokkos(Mat *input_mat, const PetscReal fraction_swap, cons
}
});
Kokkos::fence();

//check_cf_markers_all_marked_kokkos(cf_markers_d, cf_markers_d.extent(0), MPI_COMM_MATRIX);
}
return;
}
Expand Down
Loading
Loading