@@ -211,6 +211,19 @@ def dpu_operator_start(client: K8sClient, repo: Optional[str]) -> None:
211
211
client .oc_run_or_die ("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=5m" )
212
212
213
213
214
def ensure_vsp_ds_running(client: K8sClient, *, retries: int = 10, interval: float = 10) -> None:
    """Block until every scheduled pod of the ``vsp`` DaemonSet is available.

    Polls the DaemonSet status through ``client.oc_run_or_die`` and compares
    ``.status.desiredNumberScheduled`` with ``.status.numberAvailable``.
    Returns as soon as both counts match; if they still differ after
    ``retries`` polls, reports the failure via ``logger.error_and_exit``.

    Args:
        client: Cluster client used to run the ``oc get ds vsp`` queries.
        retries: Maximum number of status polls before giving up.
        interval: Seconds to sleep after each unsuccessful poll.
    """

    def pod_count(status_field: str) -> int:
        # Query a single integer field of the DaemonSet status.
        raw = client.oc_run_or_die(f"get ds vsp -o jsonpath='{{.status.{status_field}}}'").out
        # Drop surrounding whitespace and any quotes that survived command
        # parsing so int() only sees the digits.
        raw = raw.strip().strip("'\"")
        # Kubernetes omits zero-valued optional status fields (numberAvailable
        # among them), in which case the jsonpath query prints nothing. Treat
        # that as 0 instead of letting int("") raise ValueError mid-rollout.
        return int(raw) if raw else 0

    for _ in range(retries):
        desired_pods = pod_count("desiredNumberScheduled")
        available_pods = pod_count("numberAvailable")
        if available_pods == desired_pods:
            return
        logger.info(f"Waiting for VSP ds to scale up. Desired pods: {desired_pods} Available pods: {available_pods}")
        time.sleep(interval)

    # Every poll saw a mismatch: the DaemonSet never became fully available.
    logger.error_and_exit("Timed out waiting for all pods of the VSP daemonset to become available")
226
+
214
227
def ExtraConfigDpu (cc : ClustersConfig , cfg : ExtraConfigArgs , futures : dict [str , Future [Optional [host .Result ]]]) -> None :
215
228
[f .result () for (_ , f ) in futures .items ()]
216
229
logger .info ("Running post config step to start DPU operator on IPU" )
@@ -246,6 +259,7 @@ def ExtraConfigDpu(cc: ClustersConfig, cfg: ExtraConfigArgs, futures: dict[str,
246
259
vendor_plugin .start (vendor_plugin .vsp_image_name (imgReg ), client )
247
260
else :
248
261
vendor_plugin .build_push_start (lh , client , imgReg )
262
+ ensure_vsp_ds_running (client )
249
263
250
264
git_repo_setup (repo , repo_wipe = False , url = DPU_OPERATOR_REPO )
251
265
if cfg .rebuild_dpu_operators_images :
@@ -293,6 +307,7 @@ def ExtraConfigDpuHost(cc: ClustersConfig, cfg: ExtraConfigArgs, futures: dict[s
293
307
h .ssh_connect ("core" )
294
308
vendor_plugin = init_vendor_plugin (h , node .kind or "" )
295
309
vendor_plugin .build_push_start (lh , client , imgReg )
310
+ ensure_vsp_ds_running (client )
296
311
297
312
git_repo_setup (repo , repo_wipe = False , url = DPU_OPERATOR_REPO )
298
313
if cfg .rebuild_dpu_operators_images :
0 commit comments