
Commit 5d38a59

Merge pull request #105 from PFLAREProject/pmisr_kokkos_optim
Pmisr kokkos optim
2 parents a2bd617 + 30ee5b2 commit 5d38a59

File tree

1 file changed: +78 −70 lines

src/PMISR_DDCk.kokkos.cxx

Lines changed: 78 additions & 70 deletions
@@ -88,23 +88,26 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons

   // Device memory for the global variable cf_markers_local_d - be careful these aren't petsc ints
   cf_markers_local_d = intKokkosView("cf_markers_local_d", local_rows);
-  PetscScalar *cf_markers_local_real_d_ptr = NULL, *cf_markers_nonlocal_real_d_ptr = NULL;
-  // The real equivalents so we can use the existing petscsf from the input matrix
-  PetscScalarKokkosView cf_markers_local_real_d("cf_markers_local_real_d", local_rows);
-  PetscScalarKokkosView cf_markers_nonlocal_real_d;
-  cf_markers_local_real_d_ptr = cf_markers_local_real_d.data();
+  // Can't use the global directly within the parallel
+  // regions on the device so just take a shallow copy
+  intKokkosView cf_markers_d = cf_markers_local_d;
+
+  intKokkosView cf_markers_nonlocal_d;
+  int *cf_markers_d_ptr = NULL, *cf_markers_nonlocal_d_ptr = NULL;
+  cf_markers_d_ptr = cf_markers_d.data();

   // Host and device memory for the measure
   PetscScalarKokkosViewHost measure_local_h(measure_local, local_rows);
   PetscScalarKokkosView measure_local_d("measure_local_d", local_rows);
   PetscScalar *measure_local_d_ptr = NULL, *measure_nonlocal_d_ptr = NULL;
   measure_local_d_ptr = measure_local_d.data();
   PetscScalarKokkosView measure_nonlocal_d;
+
   if (mpi) {
      measure_nonlocal_d = PetscScalarKokkosView("measure_nonlocal_d", cols_ao);
      measure_nonlocal_d_ptr = measure_nonlocal_d.data();
-     cf_markers_nonlocal_real_d = PetscScalarKokkosView("cf_markers_nonlocal_real_d", cols_ao);
-     cf_markers_nonlocal_real_d_ptr = cf_markers_nonlocal_real_d.data();
+     cf_markers_nonlocal_d = intKokkosView("cf_markers_nonlocal_d", cols_ao);
+     cf_markers_nonlocal_d_ptr = cf_markers_nonlocal_d.data();
   }

   // Device memory for the mark
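This hunk drops the PetscScalar "real equivalents" in favour of plain int views, and takes a shallow copy of the global view so it can be captured in device lambdas. Kokkos views are reference-counted handles, so the copy aliases the same allocation. A minimal sketch of the idiom, with hypothetical names (g_markers is a stand-in, not PFLARE code):

```cpp
#include <Kokkos_Core.hpp>

// Hypothetical file-scope view, standing in for the global cf_markers_local_d
using IntView = Kokkos::View<int *>;
static IntView g_markers;

int main(int argc, char *argv[]) {
  Kokkos::initialize(argc, argv);
  {
    g_markers = IntView("g_markers", 100);

    // Shallow copy: views are reference-counted handles, so no data moves
    // and writes through the copy alias g_markers
    IntView markers = g_markers;

    // The local handle is captured by value into the device lambda;
    // naming the global directly in it is not portable (e.g. on CUDA the
    // lambda would dereference a host address)
    Kokkos::parallel_for(
        "init_markers", 100, KOKKOS_LAMBDA(const int i) { markers(i) = -1; });
    Kokkos::fence();
  }
  g_markers = IntView(); // release the allocation before finalize
  Kokkos::finalize();
  return 0;
}
```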
@@ -153,43 +156,39 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons

   // Initialise the set
   PetscInt counter_in_set_start = 0;
-  // Count how many in the set to begin with
+  // Count how many in the set to begin with and set their CF markers
   Kokkos::parallel_reduce ("Reduction", local_rows, KOKKOS_LAMBDA (const PetscInt i, PetscInt& update) {
-     if (Kokkos::abs(measure_local_d[i]) < 1) update++;
-  }, counter_in_set_start);
-
-  Kokkos::parallel_for(
-     Kokkos::RangePolicy<>(0, local_rows), KOKKOS_LAMBDA(PetscInt i) {
-
-     if (Kokkos::abs(measure_local_d(i)) < 1)
+     if (Kokkos::abs(measure_local_d[i]) < 1)
      {
        if (zero_measure_c_point_int == 1) {
           if (pmis_int == 1) {
              // Set as F here but reversed below to become C
-             cf_markers_local_real_d(i) = -1;
+             cf_markers_d(i) = -1;
           }
           else {
              // Becomes C
-             cf_markers_local_real_d(i) = 1;
+             cf_markers_d(i) = 1;
           }
        }
        else {
           if (pmis_int == 1) {
              // Set as C here but reversed below to become F
              // Otherwise dirichlet conditions persist down onto the coarsest grid
-             cf_markers_local_real_d(i) = 1;
+             cf_markers_d(i) = 1;
           }
           else {
              // Becomes F
-             cf_markers_local_real_d(i) = -1;
+             cf_markers_d(i) = -1;
           }
-        }
+        }
+        // Count
+        update++;
      }
      else
      {
-        cf_markers_local_real_d(i) = 0;
-     }
-  });
+        cf_markers_d(i) = 0;
+     }
+  }, counter_in_set_start);

   // Check the total number of undecided in parallel
   PetscInt counter_undecided, counter_parallel;
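This hunk fuses the old marker-setting parallel_for into the counting parallel_reduce, saving a kernel launch; each index is visited exactly once in a reduction, so the side-effecting writes are safe. A minimal sketch of the fused pattern, with illustrative names (measure, markers):

```cpp
#include <Kokkos_Core.hpp>

// Minimal sketch of the fusion: the marker assignment that used to live in
// a separate parallel_for now happens inside the counting parallel_reduce,
// so one kernel launch does both. measure/markers are illustrative names.
int count_and_mark(Kokkos::View<const double *> measure,
                   Kokkos::View<int *> markers) {
  const int n = static_cast<int>(measure.extent(0));
  int in_set = 0;
  Kokkos::parallel_reduce(
      "count_and_mark", n,
      KOKKOS_LAMBDA(const int i, int &update) {
        if (Kokkos::abs(measure(i)) < 1.0) {
          markers(i) = -1; // side effect: assign the marker in the same pass
          update++;        // and count it
        } else {
          markers(i) = 0;  // still undecided
        }
      },
      in_set);
  return in_set;
}
```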
@@ -238,24 +237,17 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons

   // Start the async scatter of the nonlocal cf_markers
   // ~~~~~~~~~
   if (mpi) {
-     // We can't overwrite any of the values in cf_markers_local_real_d while the forward scatter is still going
-     PetscSFBcastWithMemTypeBegin(mat_mpi->Mvctx, MPIU_SCALAR,
-              mem_type, cf_markers_local_real_d_ptr,
-              mem_type, cf_markers_nonlocal_real_d_ptr,
+     // We can't overwrite any of the values in cf_markers_d while the forward scatter is still going
+     // Be careful these aren't petscints
+     PetscSFBcastWithMemTypeBegin(mat_mpi->Mvctx, MPI_INT,
+              mem_type, cf_markers_d_ptr,
+              mem_type, cf_markers_nonlocal_d_ptr,
              MPI_REPLACE);
   }

-  // This keeps track of which of the candidate nodes can become in the set
+  // mark_d keeps track of which of the candidate nodes can become in the set
   // Only need this because we want to do async comms so we need a way to trigger
   // a node not being in the set due to either strong local neighbours *or* strong offproc neighbours
-  Kokkos::deep_copy(mark_d, true);
-
-  // Any that aren't zero cf marker are already assigned so set to to false
-  Kokkos::parallel_for(
-     Kokkos::RangePolicy<>(0, local_rows), KOKKOS_LAMBDA(PetscInt i) {
-
-     if (cf_markers_local_real_d(i) != 0) mark_d(i) = false;
-  });

   // ~~~~~~~~
   // Go and do the local component
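With int markers the scatter can use MPI_INT directly instead of round-tripping through PetscScalar. A hedged sketch of the begin/overlap/end shape of this exchange, as a hypothetical helper around PETSc's PetscSFBcastWithMemTypeBegin/PetscSFBcastEnd:

```cpp
#include <petscsf.h>

// Hypothetical helper showing the begin/overlap/end shape of the scatter,
// with MPI_INT since the markers are now plain ints rather than PetscScalars
static PetscErrorCode halo_exchange_int(PetscSF sf, PetscMemType mem_type,
                                        const int *local_d, int *nonlocal_d)
{
  PetscFunctionBegin;
  // Start the forward scatter (roots -> leaves); local_d must not be
  // overwritten until the matching End call returns
  PetscCall(PetscSFBcastWithMemTypeBegin(sf, MPI_INT,
                                         mem_type, local_d,
                                         mem_type, nonlocal_d,
                                         MPI_REPLACE));
  // ... overlap: launch kernels that only touch local data ...
  // Finish before anything reads nonlocal_d
  PetscCall(PetscSFBcastEnd(sf, MPI_INT, local_d, nonlocal_d, MPI_REPLACE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
```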
@@ -269,9 +261,8 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons
      PetscInt strong_neighbours = 0;

      // Check this row isn't already marked
-     if (cf_markers_local_real_d(i) == 0)
+     if (cf_markers_d(i) == 0)
      {
-        const PetscInt i = t.league_rank();
         const PetscInt ncols_local = device_local_i[i + 1] - device_local_i[i];

         // Reduce over local columns to get the number of strong neighbours
@@ -281,7 +272,7 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons

            // Have to only check active strong neighbours
            if (measure_local_d(i) >= measure_local_d(device_local_j[device_local_i[i] + j]) && \
-               cf_markers_local_real_d(device_local_j[device_local_i[i] + j]) == 0)
+               cf_markers_d(device_local_j[device_local_i[i] + j]) == 0)
            {
               strong_count++;
            }
@@ -292,7 +283,22 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons

         // Only want one thread in the team to write the result
         Kokkos::single(Kokkos::PerTeam(t), [&]() {
            // If we have any strong neighbours
-           if (strong_neighbours > 0) mark_d(i) = false;
+           if (strong_neighbours > 0)
+           {
+              mark_d(i) = false;
+           }
+           else
+           {
+              mark_d(i) = true;
+           }
+        });
+     }
+     // Any that aren't zero cf marker are already assigned so set to to false
+     else
+     {
+        // Only want one thread in the team to write the result
+        Kokkos::single(Kokkos::PerTeam(t), [&]() {
+           mark_d(i) = false;
         });
      }
   });
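The local pass is a team-per-row kernel: a TeamThreadRange reduction counts strong active neighbours of row i, then Kokkos::single writes the per-row verdict exactly once. A minimal sketch of that pattern over a CSR-like structure (row_ptr/col_idx/active/mark are illustrative stand-ins, not the commit's variables):

```cpp
#include <Kokkos_Core.hpp>

// Minimal sketch of the team-per-row reduce-then-single pattern used in
// this kernel; one team handles one row of an illustrative CSR matrix
void flag_rows(Kokkos::View<const int *> row_ptr,
               Kokkos::View<const int *> col_idx,
               Kokkos::View<const int *> active,
               Kokkos::View<bool *> mark) {
  using team_policy = Kokkos::TeamPolicy<>;
  const int nrows = static_cast<int>(mark.extent(0));

  Kokkos::parallel_for(
      "flag_rows", team_policy(nrows, Kokkos::AUTO()),
      KOKKOS_LAMBDA(const team_policy::member_type &t) {
        const int i = t.league_rank();
        int strong = 0;
        // Threads of the team share the reduction over row i's columns
        Kokkos::parallel_reduce(
            Kokkos::TeamThreadRange(t, row_ptr(i), row_ptr(i + 1)),
            [&](const int j, int &count) {
              if (active(col_idx(j)) == 0) count++;
            },
            strong);
        // Only one thread in the team writes the verdict for row i
        Kokkos::single(Kokkos::PerTeam(t), [&]() { mark(i) = (strong == 0); });
      });
}
```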
@@ -303,7 +309,8 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons
   if (mpi) {

      // Finish the async scatter
-     PetscSFBcastEnd(mat_mpi->Mvctx, MPIU_SCALAR, cf_markers_local_real_d_ptr, cf_markers_nonlocal_real_d_ptr, MPI_REPLACE);
+     // Be careful these aren't petscints
+     PetscSFBcastEnd(mat_mpi->Mvctx, MPI_INT, cf_markers_d_ptr, cf_markers_nonlocal_d_ptr, MPI_REPLACE);

      Kokkos::parallel_for(
         Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), local_rows, Kokkos::AUTO()),
@@ -314,9 +321,8 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons
      PetscInt strong_neighbours = 0;

      // Check this row isn't already marked
-     if (cf_markers_local_real_d(i) == 0)
+     if (mark_d(i))
      {
-        const PetscInt i = t.league_rank();
         PetscInt ncols_nonlocal = device_nonlocal_i[i + 1] - device_nonlocal_i[i];

         // Reduce over nonlocal columns to get the number of strong neighbours
@@ -325,7 +331,7 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons
            [&](const PetscInt j, PetscInt& strong_count) {

            if (measure_local_d(i) >= measure_nonlocal_d(device_nonlocal_j[device_nonlocal_i[i] + j]) && \
-               cf_markers_nonlocal_real_d(device_nonlocal_j[device_nonlocal_i[i] + j]) == 0)
+               cf_markers_nonlocal_d(device_nonlocal_j[device_nonlocal_i[i] + j]) == 0)
            {
               strong_count++;
            }
@@ -335,25 +341,28 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons

         // Only want one thread in the team to write the result
         Kokkos::single(Kokkos::PerTeam(t), [&]() {
-           // If we have any strong neighbours
-           if (strong_neighbours > 0) mark_d(i) = false;
+           // If we don't have any strong neighbours
+           if (strong_neighbours == 0) cf_markers_d(i) = loops_through;
         });
      }
   });
   }
+  // This cf_markers_d(i) = loops_through happens above in the case of mpi, saves a kernel launch
+  else
+  {
+     // The nodes that have mark equal to true have no strong active neighbours in the IS
+     // hence they can be in the IS
+     Kokkos::parallel_for(
+        Kokkos::RangePolicy<>(0, local_rows), KOKKOS_LAMBDA(PetscInt i) {

-  // The nodes that have mark equal to true have no strong active neighbours in the IS
-  // hence they can be in the IS
-  Kokkos::parallel_for(
-     Kokkos::RangePolicy<>(0, local_rows), KOKKOS_LAMBDA(PetscInt i) {
-
-     if (mark_d(i)) cf_markers_local_real_d(i) = double(loops_through);
-  });
+        if (mark_d(i)) cf_markers_d(i) = loops_through;
+     });
+  }

   if (mpi)
   {
      // We're going to do an add reverse scatter, so set them to zero
-     Kokkos::deep_copy(cf_markers_nonlocal_real_d, 0.0);
+     Kokkos::deep_copy(cf_markers_nonlocal_d, 0.0);

      Kokkos::parallel_for(
         Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), local_rows, Kokkos::AUTO()),
@@ -363,27 +372,27 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons
         const PetscInt i = t.league_rank();

         // Check if this node has been assigned during this top loop
-        if (cf_markers_local_real_d(i) == loops_through)
+        if (cf_markers_d(i) == loops_through)
         {
-           const PetscInt i = t.league_rank();
            PetscInt ncols_nonlocal = device_nonlocal_i[i + 1] - device_nonlocal_i[i];

            // For over nonlocal columns
            Kokkos::parallel_for(
               Kokkos::TeamThreadRange(t, ncols_nonlocal), [&](const PetscInt j) {

               // Needs to be atomic as may being set by many threads
-              Kokkos::atomic_store(&cf_markers_nonlocal_real_d(device_nonlocal_j[device_nonlocal_i[i] + j]), 1.0);
+              Kokkos::atomic_store(&cf_markers_nonlocal_d(device_nonlocal_j[device_nonlocal_i[i] + j]), 1.0);
            });
         }
      });

      // We've updated the values in cf_markers_nonlocal
      // Calling a reverse scatter add will then update the values of cf_markers_local
      // Reduce with a sum, equivalent to VecScatterBegin with ADD_VALUES, SCATTER_REVERSE
-     PetscSFReduceWithMemTypeBegin(mat_mpi->Mvctx, MPIU_SCALAR,
-              mem_type, cf_markers_nonlocal_real_d_ptr,
-              mem_type, cf_markers_local_real_d_ptr,
+     // Be careful these aren't petscints
+     PetscSFReduceWithMemTypeBegin(mat_mpi->Mvctx, MPI_INT,
+              mem_type, cf_markers_nonlocal_d_ptr,
+              mem_type, cf_markers_d_ptr,
              MPIU_SUM);
   }
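A hedged sketch of the reverse scatter-add above, the PetscSF analogue of VecScatterBegin/End with ADD_VALUES and SCATTER_REVERSE, using MPI_INT as in the commit (the helper and its names are hypothetical):

```cpp
#include <petscsf.h>

// Hypothetical helper mirroring the reverse scatter in this hunk: leaf
// values (nonlocal markers) are summed back into their roots (local markers)
static PetscErrorCode reverse_add_int(PetscSF sf, PetscMemType mem_type,
                                      const int *nonlocal_d, int *local_d)
{
  PetscFunctionBegin;
  PetscCall(PetscSFReduceWithMemTypeBegin(sf, MPI_INT,
                                          mem_type, nonlocal_d,
                                          mem_type, local_d,
                                          MPIU_SUM));
  // ... overlap with work that doesn't touch local_d ...
  PetscCall(PetscSFReduceEnd(sf, MPI_INT, nonlocal_d, local_d, MPIU_SUM));
  PetscFunctionReturn(PETSC_SUCCESS);
}
```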

@@ -396,25 +405,27 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons
      const PetscInt i = t.league_rank();

      // Check if this node has been assigned during this top loop
-     if (cf_markers_local_real_d(i) == loops_through)
+     if (cf_markers_d(i) == loops_through)
      {
-        const PetscInt i = t.league_rank();
         const PetscInt ncols_local = device_local_i[i + 1] - device_local_i[i];

         // For over nonlocal columns
         Kokkos::parallel_for(
            Kokkos::TeamThreadRange(t, ncols_local), [&](const PetscInt j) {

            // Needs to be atomic as may being set by many threads
-           Kokkos::atomic_store(&cf_markers_local_real_d(device_local_j[device_local_i[i] + j]), 1.0);
+           // Tried a version where instead of a "push" approach I tried a pull approach
+           // that doesn't need an atomic, but it was slower
+           Kokkos::atomic_store(&cf_markers_d(device_local_j[device_local_i[i] + j]), 1.0);
         });
      }
   });

   if (mpi)
   {
      // Finish the scatter
-     PetscSFReduceEnd(mat_mpi->Mvctx, MPIU_SCALAR, cf_markers_nonlocal_real_d_ptr, cf_markers_local_real_d_ptr, MPIU_SUM);
+     // Be careful these aren't petscints
+     PetscSFReduceEnd(mat_mpi->Mvctx, MPI_INT, cf_markers_nonlocal_d_ptr, cf_markers_d_ptr, MPIU_SUM);
   }

   // We've done another top level loop
// We've done another top level loop
@@ -427,7 +438,7 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons
427438

428439
counter_undecided = 0;
429440
Kokkos::parallel_reduce ("ReductionCounter_undecided", local_rows, KOKKOS_LAMBDA (const PetscInt i, PetscInt& update) {
430-
if (cf_markers_local_real_d(i) == 0) update++;
441+
if (cf_markers_d(i) == 0) update++;
431442
}, counter_undecided);
432443

433444
// Parallel reduction!
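The device-side count of undecided nodes is then summed across ranks (the "Parallel reduction!" in the context above). A hedged sketch pairing the Kokkos reduction with PETSc's MPIU_Allreduce, as a hypothetical helper (markers and comm stand in for the commit's variables):

```cpp
#include <petscsys.h>
#include <Kokkos_Core.hpp>

// Hypothetical helper: count undecided entries on-device, then sum the
// per-rank counts so every rank sees the global total
static PetscErrorCode count_undecided_global(Kokkos::View<const int *> markers,
                                             MPI_Comm comm, PetscInt *total)
{
  PetscInt local_count = 0;
  PetscFunctionBegin;
  Kokkos::parallel_reduce(
      "count_undecided", static_cast<int>(markers.extent(0)),
      KOKKOS_LAMBDA(const int i, PetscInt &update) {
        if (markers(i) == 0) update++; // still undecided
      },
      local_count);
  *total = local_count;
  PetscCall(MPIU_Allreduce(MPI_IN_PLACE, total, 1, MPIU_INT, MPI_SUM, comm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
```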
@@ -442,17 +453,14 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons
   // Now assign our final cf markers
   // ~~~~~~~~~

-  // Can't use the global directly within the parallel
-  // regions on the device
-  intKokkosView cf_markers_d = cf_markers_local_d;
   Kokkos::parallel_for(
      Kokkos::RangePolicy<>(0, local_rows), KOKKOS_LAMBDA(PetscInt i) {

-     if (cf_markers_local_real_d(i) == 0)
+     if (cf_markers_d(i) == 0)
      {
         cf_markers_d(i) = 1;
      }
-     else if (cf_markers_local_real_d(i) < 0)
+     else if (cf_markers_d(i) < 0)
      {
         cf_markers_d(i) = -1;
      }
