Skip to content

Commit 128d407

Browse files
committed
vast CI add GPU utilization checks
1 parent 7ef05b4 commit 128d407

File tree

1 file changed

+37
-5
lines changed

1 file changed

+37
-5
lines changed

.github/workflows/vast/provision.sh

Lines changed: 37 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -131,7 +131,8 @@ while true; do
131131
echo "host_id: $HOST_ID"
132132

133133
if [[ -z "$INSTANCE_ID" ]]; then
134-
echo "No valid instance found"
134+
echo "No offer found"
135+
echo "retrying in $RECREATE_INTERVAL seconds..."
135136
sleep $RECREATE_INTERVAL
136137
continue
137138
fi
@@ -148,7 +149,8 @@ while true; do
148149
INSTANCE_ID=$(printf "%s\n" "$RESULT" | jq -r '.new_contract')
149150

150151
if [[ -z "$INSTANCE_ID" ]]; then
151-
echo "No valid instance found"
152+
echo "Creation response is empty."
153+
echo "retrying in $RETRY_INTERVAL seconds..."
152154
sleep $RETRY_INTERVAL
153155
continue
154156
fi
@@ -161,8 +163,9 @@ while true; do
161163
echo "new INSTANCE_ID: $INSTANCE_ID"
162164
else
163165
echo "success: $success"
164-
echo "instance creation failed."
166+
echo "Creation failed."
165167
$WORKDIR/delete-instance.sh
168+
echo "retrying in $RETRY_INTERVAL seconds..."
166169
sleep $RETRY_INTERVAL
167170
continue
168171
fi
@@ -238,9 +241,12 @@ while true; do
238241
continue
239242
fi
240243

244+
# check driver version
245+
echo "==== check driver version ===="
241246
driver_version_confirmed=false
242247
driver_version=$($WORKDIR/ssh-command.sh "nvidia-smi --query-gpu=driver_version --format=csv,noheader")
243248
required_version="520"
249+
echo "driver_version: $driver_version"
244250

245251
# Compare versions
246252
if [[ "$(echo -e "$driver_version\n$required_version" | sort -V | head -n 1)" == "$required_version" ]]; then
@@ -256,10 +262,36 @@ while true; do
256262
continue
257263
fi
258264

259-
scp_command="scp -i $WORKDIR/id_ed25519 -o StrictHostKeyChecking=no -o ConnectTimeout=5 -P $port $CUDA_TESTER_PATH root@${hostname}:/tmp/"
265+
# check GPU utilization
266+
echo "==== check GPU utilization ===="
267+
GPU_UTIL=$($WORKDIR/ssh-command.sh "nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader")
268+
MEM_UTIL=$($WORKDIR/ssh-command.sh "nvidia-smi --query-gpu=utilization.memory --format=csv,noheader")
269+
echo "GPU_UTIL: $GPU_UTIL"
270+
echo "MEM_UTIL: $MEM_UTIL"
271+
272+
if [ $(echo $GPU_UTIL | awk '{print $1}') -lt 5 ]; then
273+
echo "GPU utilization is less than 5%."
274+
else
275+
echo "*** GPU utilization is 5% or more. ***"
276+
$WORKDIR/delete-instance.sh
277+
continue
278+
fi
279+
280+
if [ $(echo $MEM_UTIL | awk '{print $1}') -lt 5 ]; then
281+
echo "Memory utilization is less than 5%."
282+
else
283+
echo "*** Memory utilization is 5% or more. ***"
284+
$WORKDIR/delete-instance.sh
285+
continue
286+
fi
287+
288+
# copy cuda-tester.cu
260289
echo "==== copy cuda-tester.cu ======"
290+
scp_command="scp -i $WORKDIR/id_ed25519 -o StrictHostKeyChecking=no -o ConnectTimeout=5 -P $port $CUDA_TESTER_PATH root@${hostname}:/tmp/"
261291
echo $scp_command
262292
eval $scp_command
293+
294+
# compile cuda-tester.cu
263295
echo "==== compile cuda ======"
264296
$WORKDIR/ssh-command.sh "nvcc /tmp/cuda-tester.cu -o /tmp/cuda-tester"
265297
echo "==== run cuda ======"
@@ -270,7 +302,7 @@ while true; do
270302
$WORKDIR/ssh-command.sh "apt update"
271303
break
272304
else
273-
echo "CUDA test failed"
305+
echo "*** CUDA test failed ***"
274306
$WORKDIR/delete-instance.sh
275307
continue
276308
fi

0 commit comments

Comments
 (0)