Skip to content

Commit 5c62a91

Browse files
authored
Add fix for devices that do not have memory resources (#1139)
1 parent e94b8de commit 5c62a91

File tree

1 file changed

+12
-6
lines changed

1 file changed

+12
-6
lines changed

ucp/__init__.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -83,16 +83,22 @@ def _is_mig_device(handle):
8383
for dev_idx in range(device_count):
8484
handle = pynvml.nvmlDeviceGetHandleByIndex(dev_idx)
8585

86-
# Ignore MIG devices and rely on UCX's default for now. Increasing
87-
# `UCX_CUDA_COPY_MAX_REG_RATIO` should be thoroughly tested, as it's
88-
# not yet clear whether it would be safe to set `1.0` for those
89-
# instances too.
90-
if _is_mig_device(handle):
86+
try:
87+
total_memory = pynvml.nvmlDeviceGetMemoryInfo(handle).total
88+
except pynvml.NVMLError_NotSupported:
89+
total_memory = None
90+
91+
# Ignore MIG devices and devices with no dedicated memory resource (i.e.,
92+
# only an integrated CPU+GPU memory resource) and rely on UCX's default for
93+
# now. Increasing `UCX_CUDA_COPY_MAX_REG_RATIO` should be thoroughly
94+
# tested, as it's not yet clear whether it would be safe to set `1.0`
95+
# for those instances too.
96+
if _is_mig_device(handle) or total_memory is None:
9197
continue
9298

9399
try:
94100
bar1_total = pynvml.nvmlDeviceGetBAR1MemoryInfo(handle).bar1Total
95-
except pynvml.nvml.NVMLError_NotSupported:
101+
except pynvml.NVMLError_NotSupported:
96102
# BAR1 access is not supported on this device; set it to
97103
# zero (always lower than device memory).
98104
bar1_total = 0

0 commit comments

Comments
 (0)