@@ -211,6 +211,20 @@ def dpu_operator_start(client: K8sClient, repo: Optional[str]) -> None:
211
211
client .oc_run_or_die ("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=5m" )
212
212
213
213
214
def ensure_vsp_ds_running(client: K8sClient) -> None:
    """Wait until the ``vsp`` DaemonSet reports every desired pod as available.

    Polls the DaemonSet status through ``oc`` up to 10 times, sleeping 10
    seconds after each unsuccessful poll. If the DaemonSet has not become
    ready once the attempts are exhausted, ``logger.error_and_exit`` is called.

    Args:
        client: Cluster client used to run the ``oc get ds vsp`` queries.
    """
    retries = 10

    def _status_count(cmd: str) -> int:
        # Kubernetes omits optional status fields such as ``numberAvailable``
        # while their value is zero, in which case the jsonpath query prints
        # nothing. Treat empty output as 0 instead of letting ``int("")`` raise.
        out = client.oc_run_or_die(cmd).out.strip()
        return int(out) if out else 0

    for _ in range(retries):
        desired_pods = _status_count("get ds vsp -o jsonpath='{.status.desiredNumberScheduled}'")
        available_pods = _status_count("get ds vsp -o jsonpath='{.status.numberAvailable}'")
        # A desired count of zero means nothing has been scheduled yet, so
        # "0 of 0 available" must not be accepted as a running DaemonSet.
        if desired_pods > 0 and available_pods == desired_pods:
            break
        logger.info(f"Waiting for VSP ds to scale up. Desired pods: {desired_pods} Available pods: {available_pods} ")
        time.sleep(10)
    else:
        # Loop ran out of attempts without ever hitting ``break``.
        logger.error_and_exit("Vsp pods failed to reach ready state")
226
+
227
+
214
228
def ExtraConfigDpu (cc : ClustersConfig , cfg : ExtraConfigArgs , futures : dict [str , Future [Optional [host .Result ]]]) -> None :
215
229
[f .result () for (_ , f ) in futures .items ()]
216
230
logger .info ("Running post config step to start DPU operator on IPU" )
@@ -246,6 +260,7 @@ def ExtraConfigDpu(cc: ClustersConfig, cfg: ExtraConfigArgs, futures: dict[str,
246
260
vendor_plugin .start (vendor_plugin .vsp_image_name (imgReg ), client )
247
261
else :
248
262
vendor_plugin .build_push_start (lh , client , imgReg )
263
+ ensure_vsp_ds_running (client )
249
264
250
265
git_repo_setup (repo , repo_wipe = False , url = DPU_OPERATOR_REPO )
251
266
if cfg .rebuild_dpu_operators_images :
@@ -293,6 +308,7 @@ def ExtraConfigDpuHost(cc: ClustersConfig, cfg: ExtraConfigArgs, futures: dict[s
293
308
h .ssh_connect ("core" )
294
309
vendor_plugin = init_vendor_plugin (h , node .kind or "" )
295
310
vendor_plugin .build_push_start (lh , client , imgReg )
311
+ ensure_vsp_ds_running (client )
296
312
297
313
git_repo_setup (repo , repo_wipe = False , url = DPU_OPERATOR_REPO )
298
314
if cfg .rebuild_dpu_operators_images :
0 commit comments