Skip to content

Commit 5276fae

Browse files
authored
Smoke test for cuda runtime errors (#1315)
* Add test for cuda runtime errors * Add cuda exception smoke test * Move cuda runtime error to end * Move cuda runtime error to end * Address comments * Address comments
1 parent 18c5017 commit 5276fae

File tree

1 file changed

+29
-10
lines changed

1 file changed

+29
-10
lines changed

test/smoke_test/smoke_test.py

+29-10
Original file line numberDiff line numberDiff line change
@@ -55,18 +55,23 @@ def check_nightly_binaries_date(package: str) -> None:
5555
f"Expected {module['name']} to be less then {NIGHTLY_ALLOWED_DELTA} days. But its {date_m_delta}"
5656
)
5757

58+
def test_cuda_runtime_errors_captured() -> None:
    """Check that a CUDA runtime failure is reported as a Python ``RuntimeError``.

    Runs ``torch._assert_async`` on two zero-valued CUDA tensors (one integer,
    one complex) and expects a ``RuntimeError`` whose message mentions "CUDA".
    NOTE(review): this presumably relies on a zero tensor failing the
    device-side assertion -- confirm against the torch version under test.

    The file's call site notes that this check disturbs the CUDA runtime, so
    it should be the last CUDA check executed in the process.

    Raises:
        RuntimeError: if the calls complete without raising, or (re-raised
            unchanged) if a ``RuntimeError`` unrelated to CUDA was raised.
    """
    try:
        torch._assert_async(torch.tensor(0, device="cuda"))
        torch._assert_async(torch.tensor(0 + 0j, device="cuda"))
    except RuntimeError as e:
        if "CUDA" not in str(e):
            # Not the failure this check is looking for; let it propagate as is.
            raise
        print(f"Caught CUDA exception with success: {e}")
    else:
        # Both calls returned normally, so the expected CUDA error never surfaced.
        raise RuntimeError("Expected CUDA RuntimeError but have not received!")
71+
5872
def smoke_test_cuda(package: str) -> None:
5973
if not torch.cuda.is_available() and is_cuda_system:
6074
raise RuntimeError(f"Expected CUDA {gpu_arch_ver}. However CUDA is not loaded.")
61-
if torch.cuda.is_available():
62-
if torch.version.cuda != gpu_arch_ver:
63-
raise RuntimeError(
64-
f"Wrong CUDA version. Loaded: {torch.version.cuda} Expected: {gpu_arch_ver}"
65-
)
66-
print(f"torch cuda: {torch.version.cuda}")
67-
# todo add cudnn version validation
68-
print(f"torch cudnn: {torch.backends.cudnn.version()}")
69-
print(f"cuDNN enabled? {torch.backends.cudnn.enabled}")
7075

7176
if(package == 'all' and is_cuda_system):
7277
for module in MODULES:
@@ -80,6 +85,19 @@ def smoke_test_cuda(package: str) -> None:
8085
version = imported_module._extension._check_cuda_version()
8186
print(f"{module['name']} CUDA: {version}")
8287

88+
if torch.cuda.is_available():
89+
if torch.version.cuda != gpu_arch_ver:
90+
raise RuntimeError(
91+
f"Wrong CUDA version. Loaded: {torch.version.cuda} Expected: {gpu_arch_ver}"
92+
)
93+
print(f"torch cuda: {torch.version.cuda}")
94+
# todo add cudnn version validation
95+
print(f"torch cudnn: {torch.backends.cudnn.version()}")
96+
print(f"cuDNN enabled? {torch.backends.cudnn.enabled}")
97+
98+
# This check has to be run last, since it messes up the CUDA runtime
99+
test_cuda_runtime_errors_captured()
100+
83101

84102
def smoke_test_conv2d() -> None:
85103
import torch.nn as nn
@@ -128,7 +146,6 @@ def main() -> None:
128146
)
129147
options = parser.parse_args()
130148
print(f"torch: {torch.__version__}")
131-
smoke_test_cuda(options.package)
132149
smoke_test_conv2d()
133150

134151
if options.package == "all":
@@ -138,6 +155,8 @@ def main() -> None:
138155
if installation_str.find("nightly") != -1:
139156
check_nightly_binaries_date(options.package)
140157

158+
smoke_test_cuda(options.package)
159+
141160

142161
if __name__ == "__main__":
143162
main()

0 commit comments

Comments
 (0)