@@ -176,6 +176,8 @@ def test_bool_datatype(self):
             np.array_equal(bool_array, bool_tensor_dlpack.as_numpy()))
 
     def test_cuda_multi_stream(self):
+        # Test that external stream syncs with the default
+        # and pb_tensor has proper data
         s1 = torch.cuda.Stream()
         size = 5000
         pytorch_tensor = torch.tensor([0, 0, 0, 0], device='cuda')
@@ -192,11 +194,13 @@ def test_cuda_multi_stream(self):
         self.assertTrue(torch.equal(pytorch_tensor_dlpack, expected_output))
 
     def test_cuda_non_blocking_multi_stream(self):
-        s1 = cp.cuda.Stream(non_blocking=True)
+        # Test that external non-blocking stream syncs with the default stream
+        # and pb_tensor has proper data
+        non_blocking_stream = cp.cuda.Stream(non_blocking=True)
         size = 5000
         cupy_tensor = cp.array([0, 0, 0, 0])
         expected_output = cp.array([2, 2, 2, 2])
-        with s1:
+        with non_blocking_stream:
             matrix_a = cp.random.rand(size, size)
             res = cp.matmul(matrix_a, matrix_a)
             for _ in range(1000):
@@ -205,16 +209,42 @@ def test_cuda_non_blocking_multi_stream(self):
 
         pb_tensor = pb_utils.Tensor.from_dlpack('tensor', cupy_tensor)
         # Verify that non-blocking stream has no pending jobs left
-        self.assertTrue(s1.done)
+        self.assertTrue(non_blocking_stream.done)
         cupy_tensor_dlpack = cp.from_dlpack(pb_tensor)
         self.assertTrue(cp.array_equal(cupy_tensor_dlpack, expected_output))
         self.assertFalse(pb_tensor.is_cpu())
         self.assertEqual(
             pb_tensor.__dlpack_device__(), cupy_tensor.__dlpack_device__())
 
     def test_cuda_non_blocking_multi_gpu(self):
+        # Test that pb_tensor is created correctly when the tensor is on a different device
         size = 5000
         expected_output = cp.array([2, 2, 2, 2])
+        expected_dlpack_device = (2, 1)  # DLDeviceType::kDLCUDA, device_id 1
+        with cp.cuda.Device(1):
+            non_blocking_stream = cp.cuda.Stream(non_blocking=True)
+            with non_blocking_stream:
+                cupy_tensor = cp.array([0, 0, 0, 0])
+                matrix_a = cp.random.rand(size, size)
+                res = cp.matmul(matrix_a, matrix_a)
+                for _ in range(1000):
+                    res = cp.matmul(res, matrix_a)
+                cupy_tensor += cp.array([2, 2, 2, 2])
+        with cp.cuda.Device(0):
+            pb_tensor = pb_utils.Tensor.from_dlpack('tensor', cupy_tensor)
+            cupy_tensor_dlpack = cp.from_dlpack(pb_tensor)
+
+        self.assertTrue(cp.array_equal(cupy_tensor_dlpack, expected_output))
+        self.assertFalse(pb_tensor.is_cpu())
+        self.assertEqual(pb_tensor.__dlpack_device__(), expected_dlpack_device)
+        self.assertEqual(
+            pb_tensor.__dlpack_device__(), cupy_tensor.__dlpack_device__())
+
+    def test_cuda_multi_gpu(self):
+        # Test that pb_tensor is created correctly when the tensor is on a different device
+        size = 5000
+        expected_output = cp.array([2, 2, 2, 2])
+        expected_dlpack_device = (2, 1)  # DLDeviceType::kDLCUDA, device_id 1
         with cp.cuda.Device(1):
             cupy_tensor = cp.array([0, 0, 0, 0])
             matrix_a = cp.random.rand(size, size)
@@ -225,10 +255,12 @@ def test_cuda_non_blocking_multi_gpu(self):
         with cp.cuda.Device(0):
             pb_tensor = pb_utils.Tensor.from_dlpack('tensor', cupy_tensor)
             cupy_tensor_dlpack = cp.from_dlpack(pb_tensor)
-        self.assertTrue(cp.array_equal(cupy_tensor_dlpack, expected_output))
-        self.assertFalse(pb_tensor.is_cpu())
-        self.assertEqual(
-            pb_tensor.__dlpack_device__(), cupy_tensor.__dlpack_device__())
+
+        self.assertTrue(cp.array_equal(cupy_tensor_dlpack, expected_output))
+        self.assertFalse(pb_tensor.is_cpu())
+        self.assertEqual(pb_tensor.__dlpack_device__(), expected_dlpack_device)
+        self.assertEqual(
+            pb_tensor.__dlpack_device__(), cupy_tensor.__dlpack_device__())
 
 
 