Description
Following #4034 I was able to reliably reproduce the error. Using a 16³ `LatitudeLongitudeGrid` with `Float32` on an NVIDIA RTX 4090 with `--check-bounds=yes` (the flag is required), the MWE below produces the error shown further down. There is no error with a smaller 8³ grid, with `Float64`, or on a `RectilinearGrid`; these non-erroring variants are sketched after the MWE for reference.

I was not able to reproduce this on a V100. However, I have seen the same error when running simulations on an immersed `LatitudeLongitudeGrid` with `Float64` on a V100 and an H100, even without `--check-bounds=yes`.
I will try to reproduce this using just CUDA.jl. It's interesting that the error suggests the RTX 4090 has a "Maximum number of threads per block" of 512, when CUDA deviceQuery says it's 1024.
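The discrepancy looks like a per-kernel limit rather than a hardware one: the launch diagnostics compare the requested block size against the maximum the compiled kernel can support, which drops as register usage grows (and `--check-bounds=yes` tends to inflate register usage). Below is a minimal sketch of how to inspect both numbers with CUDA.jl alone; the kernel is a throwaway placeholder, not the mapreduce kernel from the failing launch:

```julia
using CUDA

# Device-wide hardware limit (the 1024 that deviceQuery reports):
dev_max = CUDA.attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)

# A throwaway kernel, compiled but not launched, just to query per-kernel limits:
function add_one!(x)
    i = threadIdx().x
    if i <= length(x)
        @inbounds x[i] += 1
    end
    return nothing
end

x = CUDA.zeros(Float32, 640)
kernel = @cuda launch=false add_one!(x)

# Maximum threads per block for this particular compiled kernel; this is the
# "kernel limit" the error message refers to, and it shrinks with register use:
kernel_max = CUDA.maxthreads(kernel)
regs = CUDA.registers(kernel)

@show dev_max kernel_max regs
```

If the failing mapreduce kernel reports a `maxthreads` of 512 with `--check-bounds=yes` and 1024 without, that would explain why the flag is needed to trigger the error here.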
MWE:

```julia
using Oceananigans
using Oceananigans.Advection: cell_advection_timescaleᶜᶜᶜ

grid = LatitudeLongitudeGrid(GPU(), Float32;
                             topology = (Bounded, Bounded, Bounded),
                             size = (16, 16, 16),
                             longitude = (-10, 10),
                             latitude = (-10, 10),
                             z = (-100, 0))

model = HydrostaticFreeSurfaceModel(; grid)
u, v, w = model.velocities

τ = KernelFunctionOperation{Center, Center, Center}(cell_advection_timescaleᶜᶜᶜ, grid, u, v, w)
τ_min = minimum(τ)
```
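For reference, the non-erroring variants mentioned in the description differ only in the grid; the rest of the MWE is unchanged. The Cartesian extents for the `RectilinearGrid` are arbitrary choices just to make the sketch runnable:

```julia
# 8³ grid: no error
small_grid = LatitudeLongitudeGrid(GPU(), Float32;
                                   topology = (Bounded, Bounded, Bounded),
                                   size = (8, 8, 8),
                                   longitude = (-10, 10),
                                   latitude = (-10, 10),
                                   z = (-100, 0))

# Float64: no error
f64_grid = LatitudeLongitudeGrid(GPU(), Float64;
                                 topology = (Bounded, Bounded, Bounded),
                                 size = (16, 16, 16),
                                 longitude = (-10, 10),
                                 latitude = (-10, 10),
                                 z = (-100, 0))

# RectilinearGrid: no error
rectilinear_grid = RectilinearGrid(GPU(), Float32;
                                   topology = (Bounded, Bounded, Bounded),
                                   size = (16, 16, 16),
                                   x = (-10, 10),
                                   y = (-10, 10),
                                   z = (-100, 0))
```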
Error:

```
ERROR: Number of threads per block exceeds kernel limit (640 > 512).
Stacktrace:
[1] error(s::String)
@ Base ./error.jl:35
[2] diagnose_launch_failure(f::CUDA.CuFunction, err::CUDA.CuError; blockdim::CUDA.CuDim3, threaddim::CUDA.CuDim3, shmem::Int64)
@ CUDA ~/.julia/packages/CUDA/2kjXI/lib/cudadrv/execution.jl:120
[3] launch(::CUDA.CuFunction, ::CUDA.KernelState, ::CartesianIndices{…}, ::CartesianIndices{…}, ::CUDA.CuDeviceArray{…}, ::KernelFunctionOperation{…}; blocks::Int64, threads::Int64, cooperative::Bool, shmem::Int64, stream::CUDA.CuStream)
@ CUDA ~/.julia/packages/CUDA/2kjXI/lib/cudadrv/execution.jl:73
[4] launch
@ ~/.julia/packages/CUDA/2kjXI/lib/cudadrv/execution.jl:52 [inlined]
[5] #972
@ ~/.julia/packages/CUDA/2kjXI/lib/cudadrv/execution.jl:189 [inlined]
[6] macro expansion
@ ~/.julia/packages/CUDA/2kjXI/lib/cudadrv/execution.jl:149 [inlined]
[7] macro expansion
@ ./none:0 [inlined]
[8] convert_arguments
@ ./none:0 [inlined]
[9] #cudacall#971
@ ~/.julia/packages/CUDA/2kjXI/lib/cudadrv/execution.jl:191 [inlined]
[10] cudacall
@ ~/.julia/packages/CUDA/2kjXI/lib/cudadrv/execution.jl:187 [inlined]
[11] macro expansion
@ ~/.julia/packages/CUDA/2kjXI/src/compiler/execution.jl:279 [inlined]
[12] macro expansion
@ ./none:0 [inlined]
[13] (::CUDA.HostKernel{…})(::typeof(identity), ::typeof(min), ::Nothing, ::CartesianIndices{…}, ::CartesianIndices{…}, ::Val{…}, ::CUDA.CuDeviceArray{…}, ::KernelFunctionOperation{…}; convert::Val{…}, call_kwargs::@Kwargs{…})
@ CUDA ./none:0
[14] AbstractKernel
@ ./none:0 [inlined]
[15] macro expansion
@ ~/.julia/packages/CUDA/2kjXI/src/compiler/execution.jl:114 [inlined]
[16] mapreducedim!(f::typeof(identity), op::typeof(min), R::SubArray{…}, A::KernelFunctionOperation{…}; init::Nothing)
@ CUDA ~/.julia/packages/CUDA/2kjXI/src/mapreduce.jl:271
[17] mapreducedim!(f::typeof(identity), op::typeof(min), R::SubArray{…}, A::KernelFunctionOperation{…})
@ CUDA ~/.julia/packages/CUDA/2kjXI/src/mapreduce.jl:169
[18] mapreducedim!(f::Function, op::Function, R::SubArray{…}, A::KernelFunctionOperation{…})
@ GPUArrays ~/.julia/packages/GPUArrays/qt4ax/src/host/mapreduce.jl:10
[19] minimum!(f::Function, r::SubArray{…}, A::KernelFunctionOperation{…}; init::Bool)
@ Base ./reducedim.jl:1036
[20] minimum!(f::Function, r::Field{…}, a::KernelFunctionOperation{…}; condition::Nothing, mask::Float64, kwargs::@Kwargs{…})
@ Oceananigans.Fields ~/atdepth/Oceananigans.jl/src/Fields/field.jl:676
[21] minimum(f::Function, c::KernelFunctionOperation{Center, Center, Center, LatitudeLongitudeGrid{…}, Float32, typeof(cell_advection_timescaleᶜᶜᶜ), Tuple{…}}; condition::Nothing, mask::Float64, dims::Function)
@ Oceananigans.Fields ~/atdepth/Oceananigans.jl/src/Fields/field.jl:706
[22] minimum
@ ~/atdepth/Oceananigans.jl/src/Fields/field.jl:695 [inlined]
[23] minimum(c::KernelFunctionOperation{Center, Center, Center, LatitudeLongitudeGrid{…}, Float32, typeof(cell_advection_timescaleᶜᶜᶜ), Tuple{…}})
@ Oceananigans.Fields ~/atdepth/Oceananigans.jl/src/Fields/field.jl:715
[24] top-level scope
@ REPL[7]:1
caused by: CUDA error: too many resources requested for launch (code 701, ERROR_LAUNCH_OUT_OF_RESOURCES)
Stacktrace:
[1] throw_api_error(res::CUDA.cudaError_enum)
@ CUDA ~/.julia/packages/CUDA/2kjXI/lib/cudadrv/libcuda.jl:30
[2] check
@ ~/.julia/packages/CUDA/2kjXI/lib/cudadrv/libcuda.jl:37 [inlined]
[3] cuLaunchKernel
@ ~/.julia/packages/CUDA/2kjXI/lib/utils/call.jl:34 [inlined]
[4] (::CUDA.var"#966#967"{Bool, Int64, CUDA.CuStream, CUDA.CuFunction, CUDA.CuDim3, CUDA.CuDim3})(kernelParams::Vector{Ptr{Nothing}})
@ CUDA ~/.julia/packages/CUDA/2kjXI/lib/cudadrv/execution.jl:66
[5] macro expansion
@ ~/.julia/packages/CUDA/2kjXI/lib/cudadrv/execution.jl:33 [inlined]
[6] macro expansion
@ ./none:0 [inlined]
[7] pack_arguments(::CUDA.var"#966#967"{…}, ::CUDA.KernelState, ::CartesianIndices{…}, ::CartesianIndices{…}, ::CUDA.CuDeviceArray{…}, ::KernelFunctionOperation{…})
@ CUDA ./none:0
[8] launch(::CUDA.CuFunction, ::CUDA.KernelState, ::CartesianIndices{…}, ::CartesianIndices{…}, ::CUDA.CuDeviceArray{…}, ::KernelFunctionOperation{…}; blocks::Int64, threads::Int64, cooperative::Bool, shmem::Int64, stream::CUDA.CuStream)
@ CUDA ~/.julia/packages/CUDA/2kjXI/lib/cudadrv/execution.jl:59
[9] launch
@ ~/.julia/packages/CUDA/2kjXI/lib/cudadrv/execution.jl:52 [inlined]
[10] #972
@ ~/.julia/packages/CUDA/2kjXI/lib/cudadrv/execution.jl:189 [inlined]
[11] macro expansion
@ ~/.julia/packages/CUDA/2kjXI/lib/cudadrv/execution.jl:149 [inlined]
[12] macro expansion
@ ./none:0 [inlined]
[13] convert_arguments
@ ./none:0 [inlined]
[14] #cudacall#971
@ ~/.julia/packages/CUDA/2kjXI/lib/cudadrv/execution.jl:191 [inlined]
[15] cudacall
@ ~/.julia/packages/CUDA/2kjXI/lib/cudadrv/execution.jl:187 [inlined]
[16] macro expansion
@ ~/.julia/packages/CUDA/2kjXI/src/compiler/execution.jl:279 [inlined]
[17] macro expansion
@ ./none:0 [inlined]
[18] (::CUDA.HostKernel{…})(::typeof(identity), ::typeof(min), ::Nothing, ::CartesianIndices{…}, ::CartesianIndices{…}, ::Val{…}, ::CUDA.CuDeviceArray{…}, ::KernelFunctionOperation{…}; convert::Val{…}, call_kwargs::@Kwargs{…})
@ CUDA ./none:0
[19] AbstractKernel
@ ./none:0 [inlined]
[20] macro expansion
@ ~/.julia/packages/CUDA/2kjXI/src/compiler/execution.jl:114 [inlined]
[21] mapreducedim!(f::typeof(identity), op::typeof(min), R::SubArray{…}, A::KernelFunctionOperation{…}; init::Nothing)
@ CUDA ~/.julia/packages/CUDA/2kjXI/src/mapreduce.jl:271
[22] mapreducedim!(f::typeof(identity), op::typeof(min), R::SubArray{…}, A::KernelFunctionOperation{…})
@ CUDA ~/.julia/packages/CUDA/2kjXI/src/mapreduce.jl:169
[23] mapreducedim!(f::Function, op::Function, R::SubArray{…}, A::KernelFunctionOperation{…})
@ GPUArrays ~/.julia/packages/GPUArrays/qt4ax/src/host/mapreduce.jl:10
[24] minimum!(f::Function, r::SubArray{…}, A::KernelFunctionOperation{…}; init::Bool)
@ Base ./reducedim.jl:1036
[25] minimum!(f::Function, r::Field{…}, a::KernelFunctionOperation{…}; condition::Nothing, mask::Float64, kwargs::@Kwargs{…})
@ Oceananigans.Fields ~/atdepth/Oceananigans.jl/src/Fields/field.jl:676
[26] minimum(f::Function, c::KernelFunctionOperation{Center, Center, Center, LatitudeLongitudeGrid{…}, Float32, typeof(cell_advection_timescaleᶜᶜᶜ), Tuple{…}}; condition::Nothing, mask::Float64, dims::Function)
@ Oceananigans.Fields ~/atdepth/Oceananigans.jl/src/Fields/field.jl:706
[27] minimum
@ ~/atdepth/Oceananigans.jl/src/Fields/field.jl:695 [inlined]
[28] minimum(c::KernelFunctionOperation{Center, Center, Center, LatitudeLongitudeGrid{…}, Float32, typeof(cell_advection_timescaleᶜᶜᶜ), Tuple{…}})
@ Oceananigans.Fields ~/atdepth/Oceananigans.jl/src/Fields/field.jl:715
[29] top-level scope
@ REPL[7]:1
Some type information was truncated. Use `show(err)` to see complete types.
```
Environment: Oceananigans.jl `main` branch.
```
julia> versioninfo()
Julia Version 1.10.7
Commit 4976d05258e (2024-11-26 15:57 UTC)
Build Info:
Official https://julialang.org/ release
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 48 × AMD Ryzen Threadripper 7960X 24-Cores
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-15.0.7 (ORCJIT, znver3)
Threads: 16 default, 0 interactive, 8 GC (on 48 virtual cores)
Environment:
LD_PRELOAD = /usr/NX/lib/libnxegl.so
julia> CUDA.versioninfo()
CUDA runtime 12.6, artifact installation
CUDA driver 12.7
NVIDIA driver 565.77.0
CUDA libraries:
- CUBLAS: 12.6.4
- CURAND: 10.3.7
- CUFFT: 11.3.0
- CUSOLVER: 11.7.1
- CUSPARSE: 12.5.4
- CUPTI: 2024.3.2 (API 24.0.0)
- NVML: 12.0.0+565.77
Julia packages:
- CUDA: 5.5.2
- CUDA_Driver_jll: 0.10.4+0
- CUDA_Runtime_jll: 0.15.5+0
Toolchain:
- Julia: 1.10.7
- LLVM: 15.0.7
1 device:
0: NVIDIA GeForce RTX 4090 (sm_89, 19.505 GiB / 23.988 GiB available)
```