@@ -131,7 +131,8 @@ while true; do
131
131
echo " host_id: $HOST_ID "
132
132
133
133
if [[ -z " $INSTANCE_ID " ]]; then
134
- echo " No valid instance found"
134
+ echo " No offer found"
135
+ echo " retrying in $RECREATE_INTERVAL seconds..."
135
136
sleep $RECREATE_INTERVAL
136
137
continue
137
138
fi
@@ -148,7 +149,8 @@ while true; do
148
149
INSTANCE_ID=$( printf " %s\n" " $RESULT " | jq -r ' .new_contract' )
149
150
150
151
if [[ -z " $INSTANCE_ID " ]]; then
151
- echo " No valid instance found"
152
+ echo " Creation response is empty."
153
+ echo " retrying in $RETRY_INTERVAL seconds..."
152
154
sleep $RETRY_INTERVAL
153
155
continue
154
156
fi
@@ -161,8 +163,9 @@ while true; do
161
163
echo " new INSTANCE_ID: $INSTANCE_ID "
162
164
else
163
165
echo " success: $success "
164
- echo " instance creation failed."
166
+ echo " Creation failed."
165
167
$WORKDIR /delete-instance.sh
168
+ echo " retrying in $RETRY_INTERVAL seconds..."
166
169
sleep $RETRY_INTERVAL
167
170
continue
168
171
fi
@@ -238,9 +241,12 @@ while true; do
238
241
continue
239
242
fi
240
243
244
+ # check driver version
245
+ echo " ==== check driver version ===="
241
246
driver_version_confirmed=false
242
247
driver_version=$( $WORKDIR /ssh-command.sh " nvidia-smi --query-gpu=driver_version --format=csv,noheader" )
243
248
required_version=" 520"
249
+ echo " driver_version: $driver_version "
244
250
245
251
# Compare versions
246
252
if [[ " $( echo -e " $driver_version \n$required_version " | sort -V | head -n 1) " == " $required_version " ]]; then
@@ -256,10 +262,36 @@ while true; do
256
262
continue
257
263
fi
258
264
259
- scp_command=" scp -i $WORKDIR /id_ed25519 -o StrictHostKeyChecking=no -o ConnectTimeout=5 -P $port $CUDA_TESTER_PATH root@${hostname} :/tmp/"
265
+ # check GPU utilization
266
+ echo " ==== check GPU utilization ===="
267
+ GPU_UTIL=$( $WORKDIR /ssh-command.sh " nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader" )
268
+ MEM_UTIL=$( $WORKDIR /ssh-command.sh " nvidia-smi --query-gpu=utilization.memory --format=csv,noheader" )
269
+ echo " GPU_UTIL: $GPU_UTIL "
270
+ echo " MEM_UTIL: $MEM_UTIL "
271
+
272
+ if [ $( echo $GPU_UTIL | awk ' {print $1}' ) -lt 5 ]; then
273
+ echo " GPU utilization is less than 5%."
274
+ else
275
+ echo " *** GPU utilization is 5% or more. ***"
276
+ $WORKDIR /delete-instance.sh
277
+ continue
278
+ fi
279
+
280
+ if [ $( echo $MEM_UTIL | awk ' {print $1}' ) -lt 5 ]; then
281
+ echo " Memory utilization is less than 5%."
282
+ else
283
+ echo " *** Memory utilization is 5% or more. ***"
284
+ $WORKDIR /delete-instance.sh
285
+ continue
286
+ fi
287
+
288
+ # copy cuda-tester.cu
260
289
echo " ==== copy cuda-tester.cu ======"
290
+ scp_command=" scp -i $WORKDIR /id_ed25519 -o StrictHostKeyChecking=no -o ConnectTimeout=5 -P $port $CUDA_TESTER_PATH root@${hostname} :/tmp/"
261
291
echo $scp_command
262
292
eval $scp_command
293
+
294
+ # compile cuda-tester.cu
263
295
echo " ==== compile cuda ======"
264
296
$WORKDIR /ssh-command.sh " nvcc /tmp/cuda-tester.cu -o /tmp/cuda-tester"
265
297
echo " ==== run cuda ======"
@@ -270,7 +302,7 @@ while true; do
270
302
$WORKDIR /ssh-command.sh " apt update"
271
303
break
272
304
else
273
- echo " CUDA test failed"
305
+ echo " *** CUDA test failed *** "
274
306
$WORKDIR /delete-instance.sh
275
307
continue
276
308
fi
0 commit comments