Skip to content

Commit 2d25593

Browse files
authored
Merge pull request #185 from omlins/memoptparams
Set memopt parameters as a function of compute capability
2 parents b251536 + 79f32b7 commit 2d25593

File tree

12 files changed

+121
-43
lines changed

12 files changed

+121
-43
lines changed

src/ParallelKernel/AMDGPUExt/defaults.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ const ERRMSG_AMDGPUEXT_NOT_LOADED = "the AMDGPU extension was not loaded. Make s
55

66
function get_priority_rocstream end
77
function get_rocstream end
8+
function get_amdgpu_compute_capability end
89

910

1011
# allocators.jl

src/ParallelKernel/AMDGPUExt/shared.jl

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,13 @@ let
2828
while (id > length(rocstreams)) push!(rocstreams, AMDGPU.HIPStream(:low)) end
2929
return rocstreams[id]
3030
end
31-
end
31+
end
32+
33+
34+
## FUNCTIONS TO QUERY DEVICE PROPERTIES
35+
36+
function ParallelStencil.ParallelKernel.get_amdgpu_compute_capability(default::VersionNumber)
37+
compute_capability = default
38+
#TODO: implement and convert to something comparable to CUDA compute capability.
39+
return compute_capability
40+
end

src/ParallelKernel/CUDAExt/defaults.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ const ERRMSG_CUDAEXT_NOT_LOADED = "the CUDA extension was not loaded. Make sure
55

66
function get_priority_custream end
77
function get_custream end
8+
function get_cuda_compute_capability end
89

910

1011
# allocators.jl
@@ -15,4 +16,4 @@ rand_cuda(arg...) = @NotLoadedError(ERRMSG_CUDAEXT_NOT_LOADED)
1516
falses_cuda(arg...) = @NotLoadedError(ERRMSG_CUDAEXT_NOT_LOADED)
1617
trues_cuda(arg...) = @NotLoadedError(ERRMSG_CUDAEXT_NOT_LOADED)
1718
fill_cuda(arg...) = @NotLoadedError(ERRMSG_CUDAEXT_NOT_LOADED)
18-
fill_cuda!(arg...) = @NotLoadedError(ERRMSG_CUDAEXT_NOT_LOADED)
19+
fill_cuda!(arg...) = @NotLoadedError(ERRMSG_CUDAEXT_NOT_LOADED)

src/ParallelKernel/CUDAExt/shared.jl

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,22 @@ let
2828
while (id > length(custreams)) push!(custreams, CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[1])) end # CUDA.priority_range()[1] is min priority. # NOTE: priority_range cannot be called outside the function as only at runtime sure that CUDA is functional.
2929
return custreams[id]
3030
end
31-
end
31+
end
32+
33+
34+
## FUNCTIONS TO QUERY DEVICE PROPERTIES
35+
36+
function ParallelStencil.ParallelKernel.get_cuda_compute_capability(default::VersionNumber)
37+
compute_capability = default
38+
if haskey(ENV, "PS_CUDA_COMPUTE_CAPABILITY")
39+
compute_capability = parse(VersionNumber, ENV["PS_CUDA_COMPUTE_CAPABILITY"])
40+
else
41+
try
42+
dev = CUDA.device()
43+
compute_capability = CUDA.capability(dev)
44+
catch e
45+
@warn "Could not determine CUDA compute capability: assuming a recent architecture. Set the environment variable PS_CUDA_COMPUTE_CAPABILITY to a specific value if desired."
46+
end
47+
end
48+
return compute_capability
49+
end

src/ParallelKernel/MetalExt/defaults.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ const ERRMSG_METALEXT_NOT_LOADED = "the Metal extension was not loaded. Make sur
44

55
function get_priority_metalstream end
66
function get_metalstream end
7+
function get_metal_compute_capability end
8+
79

810
# allocators
911

@@ -14,5 +16,3 @@ falses_metal(arg...) = @NotLoadedError(ERRMSG_METALEXT_NOT_LOADED)
1416
trues_metal(arg...) = @NotLoadedError(ERRMSG_METALEXT_NOT_LOADED)
1517
fill_metal(arg...) = @NotLoadedError(ERRMSG_METALEXT_NOT_LOADED)
1618
fill_metal!(arg...) = @NotLoadedError(ERRMSG_METALEXT_NOT_LOADED)
17-
18-

src/ParallelKernel/MetalExt/shared.jl

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,14 @@ import Metal.MTL
66

77
@define_MtlCellArray
88

9+
910
## FUNCTIONS TO CHECK EXTENSIONS SUPPORT
11+
1012
ParallelStencil.ParallelKernel.is_loaded(::Val{:ParallelStencil_MetalExt}) = true
1113

14+
1215
## FUNCTIONS TO GET CREATE AND MANAGE METAL QUEUES
16+
1317
ParallelStencil.ParallelKernel.get_priority_metalstream(arg...) = get_priority_metalstream(arg...)
1418
ParallelStencil.ParallelKernel.get_metalstream(arg...) = get_metalstream(arg...)
1519

@@ -27,4 +31,13 @@ let
2731
while (id > length(metalqueues)) push!(metalqueues, MTL.MTLCommandQueue(Metal.device())) end
2832
return metalqueues[id]
2933
end
30-
end
34+
end
35+
36+
37+
## FUNCTIONS TO QUERY DEVICE PROPERTIES
38+
39+
function ParallelStencil.ParallelKernel.get_metal_compute_capability(default::VersionNumber)
40+
compute_capability = default
41+
#TODO: implement and convert to something comparable to CUDA compute capability.
42+
return compute_capability
43+
end

src/ParallelKernel/parallel.jl

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -555,7 +555,7 @@ function add_threadids(indices::Array, ranges::Array, block::Expr)
555555
end
556556
end
557557
quote
558-
$tx = (ParallelStencil.ParallelKernel.@blockIdx().x-1) * ParallelStencil.ParallelKernel.@blockDim().x + ParallelStencil.ParallelKernel.@threadIdx().x; # thread ID, dimension x
558+
$tx = ((ParallelStencil.ParallelKernel.@blockIdx().x-1) * ParallelStencil.ParallelKernel.@blockDim().x + 1) + ParallelStencil.ParallelKernel.@threadIdx().x - 1; # thread ID, dimension x #NOTE: the addition and subtraction is a trick to reduce register pressure due to Int64 indexing; normally it would simply be: $tx = (ParallelStencil.ParallelKernel.@blockIdx().x-1) * ParallelStencil.ParallelKernel.@blockDim().x + ParallelStencil.ParallelKernel.@threadIdx().x; # thread ID, dimension x
559559
$thread_bounds_check
560560
$ix = $range_x[$tx] # index, dimension x
561561
$block
@@ -570,8 +570,8 @@ function add_threadids(indices::Array, ranges::Array, block::Expr)
570570
end
571571
end
572572
quote
573-
$tx = (ParallelStencil.ParallelKernel.@blockIdx().x-1) * ParallelStencil.ParallelKernel.@blockDim().x + ParallelStencil.ParallelKernel.@threadIdx().x; # thread ID, dimension x
574-
$ty = (ParallelStencil.ParallelKernel.@blockIdx().y-1) * ParallelStencil.ParallelKernel.@blockDim().y + ParallelStencil.ParallelKernel.@threadIdx().y; # thread ID, dimension y
573+
$tx = ((ParallelStencil.ParallelKernel.@blockIdx().x-1) * ParallelStencil.ParallelKernel.@blockDim().x + 1) + ParallelStencil.ParallelKernel.@threadIdx().x - 1; # thread ID, dimension x #NOTE: the addition and subtraction is a trick to reduce register pressure due to Int64 indexing; normally it would simply be: $tx = (ParallelStencil.ParallelKernel.@blockIdx().x-1) * ParallelStencil.ParallelKernel.@blockDim().x + ParallelStencil.ParallelKernel.@threadIdx().x; # thread ID, dimension x
574+
$ty = ((ParallelStencil.ParallelKernel.@blockIdx().y-1) * ParallelStencil.ParallelKernel.@blockDim().y + 1) + ParallelStencil.ParallelKernel.@threadIdx().y - 1; # thread ID, dimension y #NOTE: the addition and subtraction is a trick to reduce register pressure due to Int64 indexing; normally it would simply be: $ty = (ParallelStencil.ParallelKernel.@blockIdx().y-1) * ParallelStencil.ParallelKernel.@blockDim().y + ParallelStencil.ParallelKernel.@threadIdx().y; # thread ID, dimension y
575575
$thread_bounds_check
576576
$ix = $range_x[$tx] # index, dimension x
577577
$iy = $range_y[$ty] # index, dimension y
@@ -588,9 +588,9 @@ function add_threadids(indices::Array, ranges::Array, block::Expr)
588588
end
589589
end
590590
quote
591-
$tx = (ParallelStencil.ParallelKernel.@blockIdx().x-1) * ParallelStencil.ParallelKernel.@blockDim().x + ParallelStencil.ParallelKernel.@threadIdx().x; # thread ID, dimension x
592-
$ty = (ParallelStencil.ParallelKernel.@blockIdx().y-1) * ParallelStencil.ParallelKernel.@blockDim().y + ParallelStencil.ParallelKernel.@threadIdx().y; # thread ID, dimension y
593-
$tz = (ParallelStencil.ParallelKernel.@blockIdx().z-1) * ParallelStencil.ParallelKernel.@blockDim().z + ParallelStencil.ParallelKernel.@threadIdx().z; # thread ID, dimension z
591+
$tx = ((ParallelStencil.ParallelKernel.@blockIdx().x-1) * ParallelStencil.ParallelKernel.@blockDim().x + 1) + ParallelStencil.ParallelKernel.@threadIdx().x - 1; # thread ID, dimension x #NOTE: the addition and subtraction is a trick to reduce register pressure due to Int64 indexing; normally it would simply be: $tx = (ParallelStencil.ParallelKernel.@blockIdx().x-1) * ParallelStencil.ParallelKernel.@blockDim().x + ParallelStencil.ParallelKernel.@threadIdx().x; # thread ID, dimension x
592+
$ty = ((ParallelStencil.ParallelKernel.@blockIdx().y-1) * ParallelStencil.ParallelKernel.@blockDim().y + 1) + ParallelStencil.ParallelKernel.@threadIdx().y - 1; # thread ID, dimension y #NOTE: the addition and subtraction is a trick to reduce register pressure due to Int64 indexing; normally it would simply be: $ty = (ParallelStencil.ParallelKernel.@blockIdx().y-1) * ParallelStencil.ParallelKernel.@blockDim().y + ParallelStencil.ParallelKernel.@threadIdx().y; # thread ID, dimension y
593+
$tz = ((ParallelStencil.ParallelKernel.@blockIdx().z-1) * ParallelStencil.ParallelKernel.@blockDim().z + 1) + ParallelStencil.ParallelKernel.@threadIdx().z - 1; # thread ID, dimension z #NOTE: the addition and subtraction is a trick to reduce register pressure due to Int64 indexing; normally it would simply be: $tz = (ParallelStencil.ParallelKernel.@blockIdx().z-1) * ParallelStencil.ParallelKernel.@blockDim().z + ParallelStencil.ParallelKernel.@threadIdx().z; # thread ID, dimension z
594594
$thread_bounds_check
595595
$ix = $range_x[$tx] # index, dimension x
596596
$iy = $range_y[$ty] # index, dimension y

src/ParallelKernel/shared.jl

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ const INT_AMDGPU = Int64 # NOTE: ...
2626
const INT_METAL = Int64 # NOTE: ...
2727
const INT_POLYESTER = Int64 # NOTE: ...
2828
const INT_THREADS = Int64 # NOTE: ...
29+
const COMPUTE_CAPABILITY_DEFAULT = typemax(VersionNumber) # having it infinity if it is not set allows to directly use statements like `if compute_capability < v"8"`, assuming a recent architecture if it is not set.
2930
const NTHREADS_X_MAX = 32
3031
const NTHREADS_X_MAX_AMDGPU = 64
3132
const NTHREADS_MAX = 256
@@ -572,6 +573,23 @@ interpolate(sym::Symbol, vals_expr::Expr, block::Expr) = interpolate(sym, (extra
572573
quote_expr(expr) = :($(Expr(:quote, expr)))
573574

574575

576+
## FUNCTIONS TO QUERY DEVICE PROPERTIES
577+
578+
function get_compute_capability(package::Symbol)
579+
default = COMPUTE_CAPABILITY_DEFAULT
580+
if (package == PKG_CUDA) get_cuda_compute_capability(default)
581+
elseif (package == PKG_AMDGPU) get_amdgpu_compute_capability(default)
582+
elseif (package == PKG_METAL) get_metal_compute_capability(default)
583+
elseif (package == PKG_THREADS) get_cpu_compute_capability(default)
584+
elseif (package == PKG_POLYESTER) get_cpu_compute_capability(default)
585+
else
586+
@ArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package). Supported packages are: $(join(SUPPORTED_PACKAGES, ", ")).")
587+
end
588+
end
589+
590+
get_cpu_compute_capability(default::VersionNumber) = return default
591+
592+
575593
## FUNCTIONS/MACROS FOR DIVERSE SYNTAX SUGAR
576594

577595
iscpu(package) = return (package in (PKG_THREADS, PKG_POLYESTER))

src/kernel_language.jl

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ function memopt(metadata_module::Module, is_parallel_kernel::Bool, caller::Modul
7878
offsets, offsets_by_z = extract_offsets(caller, body, indices, int_type, optvars, loopdim)
7979
optvars = remove_single_point_optvars(optvars, optranges, offsets, offsets_by_z)
8080
if (length(optvars)==0) @IncoherentArgumentError("incoherent argument memopt in @parallel[_indices] <kernel>: optimization can only be applied if there is at least one array that is read-only within the kernel (and accessed with a multi-point stencil). Set memopt=false for this kernel.") end
81-
optranges = define_optranges(optranges, optvars, offsets, int_type)
81+
optranges = define_optranges(optranges, optvars, offsets, int_type, package)
8282
regqueue_heads, regqueue_tails, offset_mins, offset_maxs, nb_regs_heads, nb_regs_tails = define_regqueues(offsets, optranges, optvars, indices, int_type, loopdim)
8383

8484
if loopdim == 3
@@ -102,6 +102,7 @@ function memopt(metadata_module::Module, is_parallel_kernel::Bool, caller::Modul
102102
ranges = RANGES_VARNAME
103103
range_z = :(($ranges[3])[$tz_g])
104104
range_z_start = :(($ranges[3])[1])
105+
range_z_end = :(($ranges[3])[end])
105106
i = gensym_world("i", @__MODULE__)
106107
loopoffset = gensym_world("loopoffset", @__MODULE__)
107108

@@ -125,7 +126,7 @@ function memopt(metadata_module::Module, is_parallel_kernel::Bool, caller::Modul
125126

126127
#TODO: replace wrap_if where possible with in-line if - compare performance when doing it
127128
body = quote
128-
$loopoffset = (@blockIdx().z-1)*$loopsize #TODO: MOVE UP - see no perf change! interchange other lines!
129+
$loopoffset = (@blockIdx().z-1)*$loopsize + $range_z_start-1 #TODO: MOVE UP - see no perf change! interchange other lines!
129130
$((quote
130131
$tx = @threadIdx().x + $hx1
131132
$ty = @threadIdx().y + $hy1
@@ -164,9 +165,12 @@ $((:( $reg = 0.0
164165
# for $i = $loopstart:$(mainloopstart-1)
165166
$(wrap_loop(i, loopstart:mainloopstart-1,
166167
quote
167-
$tz_g = $i + $loopoffset
168-
if ($tz_g > $rangelength_z) ParallelStencil.@return_nothing; end
169-
$iz = ($tz_g < 1) ? $range_z_start-(1-$tz_g) : $range_z # TODO: this will probably always be formulated with range_z_start
168+
$iz = $i + $loopoffset
169+
if ($iz > $range_z_end) ParallelStencil.@return_nothing; end
170+
# NOTE: the following is now fully included in the loopoffset (0.25% performance gain measured on H100) but is still of interest if we implement step ranges:
171+
# $tz_g = $i + $loopoffset
172+
# if ($tz_g > $rangelength_z) ParallelStencil.@return_nothing; end
173+
# $iz = ($tz_g < 1) ? $range_z_start-(1-$tz_g) : $range_z # TODO: this will probably always be formulated with range_z_start
170174
$((wrap_if(:($i > $(loopentry-1)),
171175
:( $reg = (0<$ix+$(oxy[1])<=size($A,1) && 0<$iy+$(oxy[2])<=size($A,2) && 0<$iz+$oz<=size($A,3)) ? $(regtarget(A, (oxy...,oz), indices)) : $reg
172176
)
@@ -212,9 +216,12 @@ $(( # NOTE: the if statement is not needed here as we only deal with registers
212216
# for $i = $mainloopstart:$mainloopend # ParallelStencil.@unroll
213217
$(wrap_loop(i, mainloopstart:mainloopend,
214218
quote
215-
$tz_g = $i + $loopoffset
216-
if ($tz_g > $rangelength_z) ParallelStencil.@return_nothing; end
217-
$iz = ($tz_g < 1) ? $range_z_start-(1-$tz_g) : $range_z # TODO: this will probably always be formulated with range_z_start
219+
$iz = $i + $loopoffset
220+
if ($iz > $range_z_end) ParallelStencil.@return_nothing; end
221+
# NOTE: the following is now fully included in the loopoffset (0.25% performance gain measured on H100) but is still of interest if we implement step ranges:
222+
# $tz_g = $i + $loopoffset
223+
# if ($tz_g > $rangelength_z) ParallelStencil.@return_nothing; end
224+
# $iz = ($tz_g < 1) ? $range_z_start-(1-$tz_g) : $range_z # TODO: this will probably always be formulated with range_z_start
218225
$(use_any_shmem ?
219226
:( @sync_threads()
220227
) : NOEXPR
@@ -468,7 +475,7 @@ end
468475

469476
function memopt(metadata_module::Module, is_parallel_kernel::Bool, caller::Module, indices::Union{Symbol,Expr}, optvars::Union{Expr,Symbol}, body::Expr; package::Symbol=get_package(caller))
470477
loopdim = isa(indices,Expr) ? length(indices.args) : 1
471-
loopsize = LOOPSIZE
478+
loopsize = compute_loopsize(package)
472479
optranges = nothing
473480
use_shmemhalos = nothing
474481
optimize_halo_read = true
@@ -545,7 +552,8 @@ function remove_single_point_optvars(optvars, optranges_arg, offsets, offsets_by
545552
return tuple((A for A in optvars if !(length(keys(offsets[A]))==1 && length(keys(offsets_by_z[A]))==1) || (!isnothing(optranges_arg) && A ∈ keys(optranges_arg)))...)
546553
end
547554

548-
function define_optranges(optranges_arg, optvars, offsets, int_type)
555+
function define_optranges(optranges_arg, optvars, offsets, int_type, package)
556+
compute_capability = get_compute_capability(package)
549557
optranges = Dict()
550558
for A in optvars
551559
zspan_max = 0
@@ -560,12 +568,12 @@ function define_optranges(optranges_arg, optvars, offsets, int_type)
560568
fullrange = typemin(int_type):typemax(int_type)
561569
pointrange_x = oxy_zspan_max[1]: oxy_zspan_max[1]
562570
pointrange_y = oxy_zspan_max[2]: oxy_zspan_max[2]
563-
if (!isnothing(optranges_arg) && A ∈ keys(optranges_arg)) optranges[A] = getproperty(optranges_arg, A)
564-
elseif (length(optvars) <= FULLRANGE_THRESHOLD) optranges[A] = (fullrange, fullrange, fullrange)
565-
elseif (USE_FULLRANGE_DEFAULT == (true, true, true)) optranges[A] = (fullrange, fullrange, fullrange)
566-
elseif (USE_FULLRANGE_DEFAULT == (false, true, true)) optranges[A] = (pointrange_x, fullrange, fullrange)
567-
elseif (USE_FULLRANGE_DEFAULT == (true, false, true)) optranges[A] = (fullrange, pointrange_y, fullrange)
568-
elseif (USE_FULLRANGE_DEFAULT == (false, false, true)) optranges[A] = (pointrange_x, pointrange_y, fullrange)
571+
if (!isnothing(optranges_arg) && A ∈ keys(optranges_arg)) optranges[A] = getproperty(optranges_arg, A)
572+
elseif (compute_capability < v"8" && (length(optvars) <= FULLRANGE_THRESHOLD)) optranges[A] = (fullrange, fullrange, fullrange)
573+
elseif (USE_FULLRANGE_DEFAULT == (true, true, true)) optranges[A] = (fullrange, fullrange, fullrange)
574+
elseif (USE_FULLRANGE_DEFAULT == (false, true, true)) optranges[A] = (pointrange_x, fullrange, fullrange)
575+
elseif (USE_FULLRANGE_DEFAULT == (true, false, true)) optranges[A] = (fullrange, pointrange_y, fullrange)
576+
elseif (USE_FULLRANGE_DEFAULT == (false, false, true)) optranges[A] = (pointrange_x, pointrange_y, fullrange)
569577
end
570578
end
571579
return optranges

0 commit comments

Comments
 (0)