@@ -88,23 +88,26 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons

// Device memory for the global variable cf_markers_local_d - be careful these aren't petsc ints
cf_markers_local_d = intKokkosView("cf_markers_local_d", local_rows);
- PetscScalar *cf_markers_local_real_d_ptr = NULL, *cf_markers_nonlocal_real_d_ptr = NULL;
- // The real equivalents so we can use the existing petscsf from the input matrix
- PetscScalarKokkosView cf_markers_local_real_d("cf_markers_local_real_d", local_rows);
- PetscScalarKokkosView cf_markers_nonlocal_real_d;
- cf_markers_local_real_d_ptr = cf_markers_local_real_d.data();
+ // Can't use the global directly within the parallel
+ // regions on the device so just take a shallow copy
+ intKokkosView cf_markers_d = cf_markers_local_d;
+
+ intKokkosView cf_markers_nonlocal_d;
+ int *cf_markers_d_ptr = NULL, *cf_markers_nonlocal_d_ptr = NULL;
+ cf_markers_d_ptr = cf_markers_d.data();

// Host and device memory for the measure
PetscScalarKokkosViewHost measure_local_h(measure_local, local_rows);
PetscScalarKokkosView measure_local_d("measure_local_d", local_rows);
PetscScalar *measure_local_d_ptr = NULL, *measure_nonlocal_d_ptr = NULL;
measure_local_d_ptr = measure_local_d.data();
PetscScalarKokkosView measure_nonlocal_d;
+
if (mpi) {
measure_nonlocal_d = PetscScalarKokkosView("measure_nonlocal_d", cols_ao);
measure_nonlocal_d_ptr = measure_nonlocal_d.data();
- cf_markers_nonlocal_real_d = PetscScalarKokkosView("cf_markers_nonlocal_real_d", cols_ao);
- cf_markers_nonlocal_real_d_ptr = cf_markers_nonlocal_real_d.data();
+ cf_markers_nonlocal_d = intKokkosView("cf_markers_nonlocal_d", cols_ao);
+ cf_markers_nonlocal_d_ptr = cf_markers_nonlocal_d.data();
}

// Device memory for the mark
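The shallow copy taken above follows standard Kokkos semantics: assigning one View to another shares the same allocation, and the local handle can then be captured by value inside a device lambda, which a file-scope global cannot. Below is a minimal standalone sketch of that pattern, not code from this repository; the global_markers name and data are made up for illustration.

    #include <Kokkos_Core.hpp>
    #include <cstdio>

    // Hypothetical file-scope view, standing in for a global like cf_markers_local_d
    Kokkos::View<int*> global_markers;

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
      {
        const int n = 10;
        global_markers = Kokkos::View<int*>("global_markers", n);

        // Shallow copy: markers shares the same device allocation as global_markers
        Kokkos::View<int*> markers = global_markers;

        // Capture the local handle by value; don't touch the global inside the lambda
        Kokkos::parallel_for("set", n, KOKKOS_LAMBDA(const int i) {
          markers(i) = -1;
        });

        // The write is visible through the global handle
        auto host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), global_markers);
        printf("global_markers(0) = %d\n", host(0));

        // Release the global view before Kokkos::finalize
        global_markers = Kokkos::View<int*>();
      }
      Kokkos::finalize();
      return 0;
    }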
@@ -153,43 +156,39 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons

// Initialise the set
PetscInt counter_in_set_start = 0;
- // Count how many in the set to begin with
+ // Count how many in the set to begin with and set their CF markers
Kokkos::parallel_reduce("Reduction", local_rows, KOKKOS_LAMBDA(const PetscInt i, PetscInt& update) {
- if (Kokkos::abs(measure_local_d[i]) < 1) update++;
- }, counter_in_set_start);
-
- Kokkos::parallel_for(
- Kokkos::RangePolicy<>(0, local_rows), KOKKOS_LAMBDA(PetscInt i) {
-
- if (Kokkos::abs(measure_local_d(i)) < 1)
+ if (Kokkos::abs(measure_local_d[i]) < 1)
{
if (zero_measure_c_point_int == 1) {
if (pmis_int == 1) {
// Set as F here but reversed below to become C
- cf_markers_local_real_d(i) = -1;
+ cf_markers_d(i) = -1;
}
else {
// Becomes C
- cf_markers_local_real_d(i) = 1;
+ cf_markers_d(i) = 1;
}
}
else {
if (pmis_int == 1) {
// Set as C here but reversed below to become F
// Otherwise dirichlet conditions persist down onto the coarsest grid
- cf_markers_local_real_d(i) = 1;
+ cf_markers_d(i) = 1;
}
else {
// Becomes F
- cf_markers_local_real_d(i) = -1;
+ cf_markers_d(i) = -1;
}
- }
+ }
+ // Count
+ update++;
}
else
{
- cf_markers_local_real_d(i) = 0;
- }
- });
+ cf_markers_d(i) = 0;
+ }
+ }, counter_in_set_start);

// Check the total number of undecided in parallel
PetscInt counter_undecided, counter_parallel;
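The merged parallel_reduce above folds the old count-only reduction and the follow-up parallel_for into a single kernel launch that both assigns markers and counts them. A minimal standalone sketch of that fused pattern follows; the names (measure, cf) and the fill data are illustrative only and not taken from the commit.

    #include <Kokkos_Core.hpp>
    #include <cstdio>

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
      {
        const int n = 1000;
        Kokkos::View<double*> measure("measure", n);
        Kokkos::View<int*> cf("cf", n);

        // Give some rows a measure below 1 (standing in for rows with no strong connections)
        Kokkos::parallel_for("fill", n, KOKKOS_LAMBDA(const int i) {
          measure(i) = (i % 7 == 0) ? 0.5 : 1.0 + (i % 3);
        });

        // Fused kernel: set the marker and count in the same pass, so a separate
        // parallel_for + parallel_reduce pair collapses into one launch
        int assigned = 0;
        Kokkos::parallel_reduce("init_set", n, KOKKOS_LAMBDA(const int i, int& update) {
          if (measure(i) < 1.0) {
            cf(i) = -1;   // assigned immediately (sign convention depends on the CF flags)
            update++;     // counted in the same kernel
          } else {
            cf(i) = 0;    // still undecided
          }
        }, assigned);

        printf("points assigned up front: %d\n", assigned);
      }
      Kokkos::finalize();
      return 0;
    }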
@@ -238,24 +237,17 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons
// Start the async scatter of the nonlocal cf_markers
// ~~~~~~~~~
if (mpi) {
- // We can't overwrite any of the values in cf_markers_local_real_d while the forward scatter is still going
- PetscSFBcastWithMemTypeBegin(mat_mpi->Mvctx, MPIU_SCALAR,
- mem_type, cf_markers_local_real_d_ptr,
- mem_type, cf_markers_nonlocal_real_d_ptr,
+ // We can't overwrite any of the values in cf_markers_d while the forward scatter is still going
+ // Be careful these aren't petscints
+ PetscSFBcastWithMemTypeBegin(mat_mpi->Mvctx, MPI_INT,
+ mem_type, cf_markers_d_ptr,
+ mem_type, cf_markers_nonlocal_d_ptr,
MPI_REPLACE);
}

- // This keeps track of which of the candidate nodes can become in the set
+ // mark_d keeps track of which of the candidate nodes can be in the set
// Only need this because we want to do async comms so we need a way to trigger
// a node not being in the set due to either strong local neighbours *or* strong offproc neighbours
- Kokkos::deep_copy(mark_d, true);
-
- // Any that aren't zero cf marker are already assigned so set to to false
- Kokkos::parallel_for(
- Kokkos::RangePolicy<>(0, local_rows), KOKKOS_LAMBDA(PetscInt i) {
-
- if (cf_markers_local_real_d(i) != 0) mark_d(i) = false;
- });

// ~~~~~~~~
// Go and do the local component
@@ -269,9 +261,8 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons
PetscInt strong_neighbours = 0;

// Check this row isn't already marked
- if (cf_markers_local_real_d(i) == 0)
+ if (cf_markers_d(i) == 0)
{
- const PetscInt i = t.league_rank();
const PetscInt ncols_local = device_local_i[i + 1] - device_local_i[i];

// Reduce over local columns to get the number of strong neighbours
@@ -281,7 +272,7 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons

// Have to only check active strong neighbours
if (measure_local_d(i) >= measure_local_d(device_local_j[device_local_i[i] + j]) && \
- cf_markers_local_real_d(device_local_j[device_local_i[i] + j]) == 0)
+ cf_markers_d(device_local_j[device_local_i[i] + j]) == 0)
{
strong_count++;
}
@@ -292,7 +283,22 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons
// Only want one thread in the team to write the result
Kokkos::single(Kokkos::PerTeam(t), [&]() {
// If we have any strong neighbours
- if (strong_neighbours > 0) mark_d(i) = false;
+ if (strong_neighbours > 0)
+ {
+ mark_d(i) = false;
+ }
+ else
+ {
+ mark_d(i) = true;
+ }
+ });
+ }
+ // Any that aren't zero cf marker are already assigned so set to false
+ else
+ {
+ // Only want one thread in the team to write the result
+ Kokkos::single(Kokkos::PerTeam(t), [&]() {
+ mark_d(i) = false;
});
}
});
@@ -303,7 +309,8 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons
if (mpi) {

// Finish the async scatter
- PetscSFBcastEnd(mat_mpi->Mvctx, MPIU_SCALAR, cf_markers_local_real_d_ptr, cf_markers_nonlocal_real_d_ptr, MPI_REPLACE);
+ // Be careful these aren't petscints
+ PetscSFBcastEnd(mat_mpi->Mvctx, MPI_INT, cf_markers_d_ptr, cf_markers_nonlocal_d_ptr, MPI_REPLACE);

Kokkos::parallel_for(
Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), local_rows, Kokkos::AUTO()),
@@ -314,9 +321,8 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons
PetscInt strong_neighbours = 0;

// Check this row isn't already marked
- if (cf_markers_local_real_d(i) == 0)
+ if (mark_d(i))
{
- const PetscInt i = t.league_rank();
PetscInt ncols_nonlocal = device_nonlocal_i[i + 1] - device_nonlocal_i[i];

// Reduce over nonlocal columns to get the number of strong neighbours
@@ -325,7 +331,7 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons
[&](const PetscInt j, PetscInt& strong_count) {

if (measure_local_d(i) >= measure_nonlocal_d(device_nonlocal_j[device_nonlocal_i[i] + j]) && \
- cf_markers_nonlocal_real_d(device_nonlocal_j[device_nonlocal_i[i] + j]) == 0)
+ cf_markers_nonlocal_d(device_nonlocal_j[device_nonlocal_i[i] + j]) == 0)
{
strong_count++;
}
@@ -335,25 +341,28 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons

// Only want one thread in the team to write the result
Kokkos::single(Kokkos::PerTeam(t), [&]() {
- // If we have any strong neighbours
- if (strong_neighbours > 0) mark_d(i) = false;
+ // If we don't have any strong neighbours
+ if (strong_neighbours == 0) cf_markers_d(i) = loops_through;
});
}
});
}
+ // This cf_markers_d(i) = loops_through happens above in the case of mpi, saves a kernel launch
+ else
+ {
+ // The nodes that have mark equal to true have no strong active neighbours in the IS
+ // hence they can be in the IS
+ Kokkos::parallel_for(
+ Kokkos::RangePolicy<>(0, local_rows), KOKKOS_LAMBDA(PetscInt i) {

- // The nodes that have mark equal to true have no strong active neighbours in the IS
- // hence they can be in the IS
- Kokkos::parallel_for(
- Kokkos::RangePolicy<>(0, local_rows), KOKKOS_LAMBDA(PetscInt i) {
-
- if (mark_d(i)) cf_markers_local_real_d(i) = double(loops_through);
- });
+ if (mark_d(i)) cf_markers_d(i) = loops_through;
+ });
+ }

if (mpi)
{
// We're going to do an add reverse scatter, so set them to zero
- Kokkos::deep_copy(cf_markers_nonlocal_real_d, 0.0);
+ Kokkos::deep_copy(cf_markers_nonlocal_d, 0.0);

Kokkos::parallel_for(
Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), local_rows, Kokkos::AUTO()),
@@ -363,27 +372,27 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons
const PetscInt i = t.league_rank();

// Check if this node has been assigned during this top loop
- if (cf_markers_local_real_d(i) == loops_through)
+ if (cf_markers_d(i) == loops_through)
{
- const PetscInt i = t.league_rank();
PetscInt ncols_nonlocal = device_nonlocal_i[i + 1] - device_nonlocal_i[i];

// For over nonlocal columns
Kokkos::parallel_for(
Kokkos::TeamThreadRange(t, ncols_nonlocal), [&](const PetscInt j) {

// Needs to be atomic as may being set by many threads
- Kokkos::atomic_store(&cf_markers_nonlocal_real_d(device_nonlocal_j[device_nonlocal_i[i] + j]), 1.0);
+ Kokkos::atomic_store(&cf_markers_nonlocal_d(device_nonlocal_j[device_nonlocal_i[i] + j]), 1.0);
});
}
});

// We've updated the values in cf_markers_nonlocal
// Calling a reverse scatter add will then update the values of cf_markers_local
// Reduce with a sum, equivalent to VecScatterBegin with ADD_VALUES, SCATTER_REVERSE
- PetscSFReduceWithMemTypeBegin(mat_mpi->Mvctx, MPIU_SCALAR,
- mem_type, cf_markers_nonlocal_real_d_ptr,
- mem_type, cf_markers_local_real_d_ptr,
+ // Be careful these aren't petscints
+ PetscSFReduceWithMemTypeBegin(mat_mpi->Mvctx, MPI_INT,
+ mem_type, cf_markers_nonlocal_d_ptr,
+ mem_type, cf_markers_d_ptr,
MPIU_SUM);
}

@@ -396,25 +405,27 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons
const PetscInt i = t.league_rank();

// Check if this node has been assigned during this top loop
- if (cf_markers_local_real_d(i) == loops_through)
+ if (cf_markers_d(i) == loops_through)
{
- const PetscInt i = t.league_rank();
const PetscInt ncols_local = device_local_i[i + 1] - device_local_i[i];

// For over nonlocal columns
Kokkos::parallel_for(
Kokkos::TeamThreadRange(t, ncols_local), [&](const PetscInt j) {

// Needs to be atomic as may being set by many threads
- Kokkos::atomic_store(&cf_markers_local_real_d(device_local_j[device_local_i[i] + j]), 1.0);
+ // Tried a version with a "pull" approach instead of this "push" approach
+ // that doesn't need an atomic, but it was slower
+ Kokkos::atomic_store(&cf_markers_d(device_local_j[device_local_i[i] + j]), 1.0);
});
}
});

if (mpi)
{
// Finish the scatter
- PetscSFReduceEnd(mat_mpi->Mvctx, MPIU_SCALAR, cf_markers_nonlocal_real_d_ptr, cf_markers_local_real_d_ptr, MPIU_SUM);
+ // Be careful these aren't petscints
+ PetscSFReduceEnd(mat_mpi->Mvctx, MPI_INT, cf_markers_nonlocal_d_ptr, cf_markers_d_ptr, MPIU_SUM);
}

// We've done another top level loop
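The push-versus-pull comment above reflects the usual trade-off for this kind of neighbour update: pushing from each selected row needs an atomic because two rows can share a neighbour, while pulling would iterate the transpose of the sparsity pattern so each marker has a single writer. Below is a small standalone sketch of the push form over a made-up CSR graph; it is not the commit's data structure, just an illustration of why the store is atomic.

    #include <Kokkos_Core.hpp>
    #include <cstdio>

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
      {
        // 4 nodes; rows 0 and 2 are "selected" and both list node 1 as a neighbour
        const int n = 4, nnz = 5;
        Kokkos::View<int*> rowptr("rowptr", n + 1), colind("colind", nnz);
        Kokkos::View<int*> selected("selected", n), marked("marked", n);

        auto rowptr_h = Kokkos::create_mirror_view(rowptr);
        auto colind_h = Kokkos::create_mirror_view(colind);
        auto selected_h = Kokkos::create_mirror_view(selected);
        const int rp[] = {0, 2, 3, 5, 5};
        const int ci[] = {1, 3, 2, 1, 0};
        const int se[] = {1, 0, 1, 0};
        for (int i = 0; i <= n; i++) rowptr_h(i) = rp[i];
        for (int k = 0; k < nnz; k++) colind_h(k) = ci[k];
        for (int i = 0; i < n; i++) selected_h(i) = se[i];
        Kokkos::deep_copy(rowptr, rowptr_h);
        Kokkos::deep_copy(colind, colind_h);
        Kokkos::deep_copy(selected, selected_h);

        // "Push": each selected row marks all of its neighbours; node 1 is written by
        // both row 0 and row 2, hence the atomic store keeps the concurrent writes
        // well-defined. A "pull" variant would loop over the transpose instead.
        Kokkos::parallel_for("push_mark", n, KOKKOS_LAMBDA(const int i) {
          if (selected(i)) {
            for (int jj = rowptr(i); jj < rowptr(i + 1); jj++)
              Kokkos::atomic_store(&marked(colind(jj)), 1);
          }
        });

        auto marked_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), marked);
        for (int i = 0; i < n; i++) printf("marked(%d) = %d\n", i, marked_h(i));
      }
      Kokkos::finalize();
      return 0;
    }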
@@ -427,7 +438,7 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons

counter_undecided = 0;
Kokkos::parallel_reduce("ReductionCounter_undecided", local_rows, KOKKOS_LAMBDA(const PetscInt i, PetscInt& update) {
- if (cf_markers_local_real_d(i) == 0) update++;
+ if (cf_markers_d(i) == 0) update++;
}, counter_undecided);

// Parallel reduction!
@@ -442,17 +453,14 @@ PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, cons
// Now assign our final cf markers
// ~~~~~~~~~

- // Can't use the global directly within the parallel
- // regions on the device
- intKokkosView cf_markers_d = cf_markers_local_d;
Kokkos::parallel_for(
Kokkos::RangePolicy<>(0, local_rows), KOKKOS_LAMBDA(PetscInt i) {

- if (cf_markers_local_real_d(i) == 0)
+ if (cf_markers_d(i) == 0)
{
cf_markers_d(i) = 1;
}
- else if (cf_markers_local_real_d(i) < 0)
+ else if (cf_markers_d(i) < 0)
{
cf_markers_d(i) = -1;
}