You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Copy file name to clipboardExpand all lines: src/ParallelKernel/CUDAExt/shared.jl
+19-1Lines changed: 19 additions & 1 deletion
Original file line number
Diff line number
Diff line change
@@ -28,4 +28,22 @@ let
28
28
while (id >length(custreams)) push!(custreams, CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[1])) end# CUDA.priority_range()[1] is min priority. # NOTE: priority_range cannot be called outside the function as only at runtime sure that CUDA is functional.
29
29
return custreams[id]
30
30
end
31
-
end
31
+
end
32
+
33
+
34
+
## FUNCTIONS TO QUERY DEVICE PROPERTIES
35
+
36
+
function ParallelStencil.ParallelKernel.get_cuda_compute_capability(default::VersionNumber)
@warn"Could not determine CUDA compute capability: assuming a recent architecture. Set the environment variable PS_CUDA_COMPUTE_CAPABILITY to a specific value if desired."
$tx = ((ParallelStencil.ParallelKernel.@blockIdx().x-1) * ParallelStencil.ParallelKernel.@blockDim().x +1) + ParallelStencil.ParallelKernel.@threadIdx().x -1; # thread ID, dimension x #NOTE: the addition and subtraction is a trick to reduce register pressure due to Int64 indexing; normally it would simply be: $tx = (ParallelStencil.ParallelKernel.@blockIdx().x-1) * ParallelStencil.ParallelKernel.@blockDim().x + ParallelStencil.ParallelKernel.@threadIdx().x; # thread ID, dimension x
559
559
$thread_bounds_check
560
560
$ix =$range_x[$tx] # index, dimension x
561
561
$block
@@ -570,8 +570,8 @@ function add_threadids(indices::Array, ranges::Array, block::Expr)
$tx = ((ParallelStencil.ParallelKernel.@blockIdx().x-1) * ParallelStencil.ParallelKernel.@blockDim().x +1) + ParallelStencil.ParallelKernel.@threadIdx().x -1; # thread ID, dimension x #NOTE: the addition and subtraction is a trick to reduce register pressure due to Int64 indexing; normally it would simply be: $tx = (ParallelStencil.ParallelKernel.@blockIdx().x-1) * ParallelStencil.ParallelKernel.@blockDim().x + ParallelStencil.ParallelKernel.@threadIdx().x; # thread ID, dimension x
574
+
$ty = ((ParallelStencil.ParallelKernel.@blockIdx().y-1) * ParallelStencil.ParallelKernel.@blockDim().y +1) + ParallelStencil.ParallelKernel.@threadIdx().y -1; # thread ID, dimension y #NOTE: the addition and subtraction is a trick to reduce register pressure due to Int64 indexing; normally it would simply be: $ty = (ParallelStencil.ParallelKernel.@blockIdx().y-1) * ParallelStencil.ParallelKernel.@blockDim().y + ParallelStencil.ParallelKernel.@threadIdx().y; # thread ID, dimension y
575
575
$thread_bounds_check
576
576
$ix =$range_x[$tx] # index, dimension x
577
577
$iy =$range_y[$ty] # index, dimension y
@@ -588,9 +588,9 @@ function add_threadids(indices::Array, ranges::Array, block::Expr)
$tx = ((ParallelStencil.ParallelKernel.@blockIdx().x-1) * ParallelStencil.ParallelKernel.@blockDim().x +1) + ParallelStencil.ParallelKernel.@threadIdx().x -1; # thread ID, dimension x #NOTE: the addition and subtraction is a trick to reduce register pressure due to Int64 indexing; normally it would simply be: $tx = (ParallelStencil.ParallelKernel.@blockIdx().x-1) * ParallelStencil.ParallelKernel.@blockDim().x + ParallelStencil.ParallelKernel.@threadIdx().x; # thread ID, dimension x
592
+
$ty = ((ParallelStencil.ParallelKernel.@blockIdx().y-1) * ParallelStencil.ParallelKernel.@blockDim().y +1) + ParallelStencil.ParallelKernel.@threadIdx().y -1; # thread ID, dimension y #NOTE: the addition and subtraction is a trick to reduce register pressure due to Int64 indexing; normally it would simply be: $ty = (ParallelStencil.ParallelKernel.@blockIdx().y-1) * ParallelStencil.ParallelKernel.@blockDim().y + ParallelStencil.ParallelKernel.@threadIdx().y; # thread ID, dimension y
593
+
$tz = ((ParallelStencil.ParallelKernel.@blockIdx().z-1) * ParallelStencil.ParallelKernel.@blockDim().z +1) + ParallelStencil.ParallelKernel.@threadIdx().z -1; # thread ID, dimension z #NOTE: the addition and subtraction is a trick to reduce register pressure due to Int64 indexing; normally it would simply be: $tz = (ParallelStencil.ParallelKernel.@blockIdx().z-1) * ParallelStencil.ParallelKernel.@blockDim().z + ParallelStencil.ParallelKernel.@threadIdx().z; # thread ID, dimension z
# Default compute capability, used when none has been set. Choosing "infinity"
# as the default allows statements like `if compute_capability < v"8"` to be
# used directly: an unset compute capability is then treated as a recent
# architecture.
const COMPUTE_CAPABILITY_DEFAULT =v"∞"
if (length(optvars)==0) @IncoherentArgumentError("incoherent argument memopt in @parallel[_indices] <kernel>: optimization can only be applied if there is at least one array that is read-only within the kernel (and accessed with a multi-point stencil). Set memopt=false for this kernel.") end
@@ -125,7 +126,7 @@ function memopt(metadata_module::Module, is_parallel_kernel::Bool, caller::Modul
125
126
126
127
#TODO: replace wrap_if where possible with in-line if - compare performance when doing it
127
128
body =quote
128
-
$loopoffset = (@blockIdx().z-1)*$loopsize #TODO: MOVE UP - see no perf change! interchange other lines!
129
+
$loopoffset = (@blockIdx().z-1)*$loopsize +$range_z_start-1#TODO: MOVE UP - see no perf change! interchange other lines!
129
130
$((quote
130
131
$tx =@threadIdx().x +$hx1
131
132
$ty =@threadIdx().y +$hy1
@@ -164,9 +165,12 @@ $((:( $reg = 0.0
164
165
# for $i = $loopstart:$(mainloopstart-1)
165
166
$(wrap_loop(i, loopstart:mainloopstart-1,
166
167
quote
167
-
$tz_g =$i +$loopoffset
168
-
if ($tz_g >$rangelength_z) ParallelStencil.@return_nothing; end
169
-
$iz = ($tz_g <1) ?$range_z_start-(1-$tz_g) :$range_z #TODO: this will probably always be formulated with range_z_start
168
+
$iz =$i +$loopoffset
169
+
if ($iz >$range_z_end) ParallelStencil.@return_nothing; end
170
+
# NOTE: the following is now fully included in the loopoffset (0.25% performance gain measured on H100) but is still of interest if we implement step ranges:
171
+
# $tz_g = $i + $loopoffset
172
+
# if ($tz_g > $rangelength_z) ParallelStencil.@return_nothing; end
173
+
# $iz = ($tz_g < 1) ? $range_z_start-(1-$tz_g) : $range_z # TODO: this will probably always be formulated with range_z_start
@@ -212,9 +216,12 @@ $(( # NOTE: the if statement is not needed here as we only deal with registers
212
216
# for $i = $mainloopstart:$mainloopend # ParallelStencil.@unroll
213
217
$(wrap_loop(i, mainloopstart:mainloopend,
214
218
quote
215
-
$tz_g =$i +$loopoffset
216
-
if ($tz_g >$rangelength_z) ParallelStencil.@return_nothing; end
217
-
$iz = ($tz_g <1) ?$range_z_start-(1-$tz_g) :$range_z #TODO: this will probably always be formulated with range_z_start
219
+
$iz =$i +$loopoffset
220
+
if ($iz >$range_z_end) ParallelStencil.@return_nothing; end
221
+
# NOTE: the following is now fully included in the loopoffset (0.25% performance gain measured on H100) but is still of interest if we implement step ranges:
222
+
# $tz_g = $i + $loopoffset
223
+
# if ($tz_g > $rangelength_z) ParallelStencil.@return_nothing; end
224
+
# $iz = ($tz_g < 1) ? $range_z_start-(1-$tz_g) : $range_z # TODO: this will probably always be formulated with range_z_start
@@ -545,7 +552,8 @@ function remove_single_point_optvars(optvars, optranges_arg, offsets, offsets_by
545
552
returntuple((A for A in optvars if!(length(keys(offsets[A]))==1&&length(keys(offsets_by_z[A]))==1) || (!isnothing(optranges_arg) && A ∈keys(optranges_arg)))...)
0 commit comments