File tree Expand file tree Collapse file tree 1 file changed +12
-6
lines changed Expand file tree Collapse file tree 1 file changed +12
-6
lines changed Original file line number Diff line number Diff line change @@ -83,16 +83,22 @@ def _is_mig_device(handle):
83
83
for dev_idx in range (device_count ):
84
84
handle = pynvml .nvmlDeviceGetHandleByIndex (dev_idx )
85
85
86
- # Ignore MIG devices and use rely on UCX's default for now. Increasing
87
- # `UCX_CUDA_COPY_MAX_REG_RATIO` should be thoroughly tested, as it's
88
- # not yet clear whether it would be safe to set `1.0` for those
89
- # instances too.
90
- if _is_mig_device (handle ):
86
+ try :
87
+ total_memory = pynvml .nvmlDeviceGetMemoryInfo (handle ).total
88
+ except pynvml .NVMLError_NotSupported :
89
+ total_memory = None
90
+
91
+ # Ignore MIG devices and devices with no memory resource (i.e., only
92
+ # integrated CPU+GPU memory resource) and rely on UCX's default for
93
+ # now. Increasing `UCX_CUDA_COPY_MAX_REG_RATIO` should be thoroughly
94
+ # tested, as it's not yet clear whether it would be safe to set `1.0`
95
+ # for those instances too.
96
+ if _is_mig_device (handle ) or total_memory is None :
91
97
continue
92
98
93
99
try :
94
100
bar1_total = pynvml .nvmlDeviceGetBAR1MemoryInfo (handle ).bar1Total
95
- except pynvml .nvml . NVMLError_NotSupported :
101
+ except pynvml .NVMLError_NotSupported :
96
102
# Bar1 access not supported on this device, set it to
97
103
# zero (always lower than device memory).
98
104
bar1_total = 0
You can’t perform that action at this time.
0 commit comments