@@ -310,75 +310,89 @@ def ExtraConfigDpuHost(cc: ClustersConfig, cfg: ExtraConfigArgs, futures: dict[s
310
310
lh = host .LocalHost ()
311
311
client = K8sClient (cc .kubeconfig )
312
312
313
- if cfg .rebuild_dpu_operators_images :
314
- registry = build_dpu_operator_images ()
315
- else :
316
- logger .info ("Will not rebuild dpu-operator images" )
317
- registry = _ensure_local_registry_running (lh , delete_all = False )
318
- operator_image = f"{ registry } /openshift-dpu-operator/cda-dpu-operator:latest"
319
- daemon_image = f"{ registry } /openshift-dpu-operator/cda-dpu-daemon:latest"
320
-
321
- # Need to trust the registry in OCP / Microshift
322
- logger .info ("Ensuring local registry is trusted in OCP" )
323
- reglocal .ocp_trust (client , reglocal .get_local_registry_base_directory (lh ), reglocal .get_local_registry_hostname (lh ), 5000 )
324
-
325
- h = host .Host (cc .workers [0 ].node )
326
- vendor_plugin = init_vendor_plugin (h )
327
- vendor_plugin .build_and_start (lh , client , registry )
328
-
329
- start_dpu_operator (lh , client , operator_image , daemon_image )
330
- client .oc_run_or_die ("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=2m" )
331
-
332
def helper(h: host.Host, node: NodeConfig) -> Optional[host.Result]:
    """Prepare one DPU worker: make sure its DPU net device exists, then label it.

    Runs on a worker thread (one submission per entry in ``cc.workers``).
    ``cfg``, ``client``, ``logger`` and
    ``ensure_rhel_9_4_kernel_is_installed`` are taken from the enclosing
    scope.

    Args:
        h: Host handle for the worker; used for SSH, command execution and
            cold boots.
        node: Configuration of the same worker; ``node.name`` is the name
            passed to ``oc label``.

    Returns:
        Always ``None``.
    """
    # Temporary workaround, remove once 4.16 installations are working
    logger.info("Ensuring Rhel 9.4 kernel is installed")
    ensure_rhel_9_4_kernel_is_installed(h)

    # There is a bug with the idpf driver that causes the IPU to fail to be
    # enumerated over PCIe on boot. As a result, we need to trigger cold boots
    # of the node until the device is available.
    # TODO: Remove when no longer needed
    probe_cmd = f"test -d /sys/class/net/{cfg.dpu_net_interface}"
    retries = 3
    h.ssh_connect("core")
    # Probe first, then decide: every cold boot (including the last one) gets
    # its result checked before we give up. Previously the final cold boot was
    # triggered and then abandoned without re-probing.
    while h.run(probe_cmd).returncode != 0:
        if retries == 0:
            logger.error_and_exit(f"Failed to bring up IPU net device on {h.hostname()}")
        logger.error(f"{h.hostname()} does not have a network device {cfg.dpu_net_interface} cold booting node to try to recover")
        h.cold_boot()
        logger.info("Cold boot triggered, waiting for host to reboot")
        time.sleep(60)
        h.ssh_connect("core")
        retries -= 1

    # Label the node. Use this call's own ``node`` argument rather than the
    # enclosing loop variable: the loop keeps rebinding that variable while
    # the worker threads are still running, so reading it here could label a
    # different node than the one just verified.
    logger.info(f"labeling node {h.hostname()} dpu=true")
    client.oc_run_or_die(f"label no {node.name} dpu=true")
    return None
357
-
358
- executor = ThreadPoolExecutor (max_workers = len (cc .workers ))
359
- f = []
360
- # Assuming that all workers have a DPU
361
- for e in cc .workers :
362
- logger .info (f"Calling helper function for node { e .node } " )
363
- bmc = host .BMC .from_bmc (e .bmc , e .bmc_user , e .bmc_password )
364
- h = host .Host (e .node , bmc )
365
- f .append (executor .submit (helper , h , e ))
366
-
367
- for thread in f :
368
- logger .info (thread .result ())
369
-
370
- logger .info ("Verified idpf is providing net-devs on DPU worker nodes" )
371
-
372
- # Create host nad
373
- # TODO: Remove when this is automatically created by the dpu operator
374
- logger .info ("Creating dpu NAD" )
375
- client .oc ("delete -f manifests/dpu/dpu_nad.yaml" )
376
- client .oc_run_or_die ("create -f manifests/dpu/dpu_nad.yaml" )
377
- # Deploy dpu daemon and wait for dpu pods to come up
378
- logger .info ("Creating dpu operator config" )
379
- client .oc_run_or_die (f"create -f { REPO_DIR } /examples/dpu.yaml" )
380
- time .sleep (30 )
381
- client .oc_run_or_die ("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=5m" )
313
+ # if cfg.rebuild_dpu_operators_images:
314
+ # registry = build_dpu_operator_images()
315
+ # else:
316
+ # logger.info("Will not rebuild dpu-operator images")
317
+ # registry = _ensure_local_registry_running(lh, delete_all=False)
318
+ # operator_image = f"{registry}/openshift-dpu-operator/cda-dpu-operator:latest"
319
+ # daemon_image = f"{registry}/openshift-dpu-operator/cda-dpu-daemon:latest"
320
+
321
+ # # Need to trust the registry in OCP / Microshift
322
+ # logger.info("Ensuring local registry is trusted in OCP")
323
+ # reglocal.ocp_trust(client, reglocal.get_local_registry_base_directory(lh), reglocal.get_local_registry_hostname(lh), 5000)
324
+
325
+
326
+ logger .info ("creating test container" )
327
+ image = "alpine:latest"
328
+ name = "ipu_host_test"
329
+ cmd = f"podman pull { image } "
330
+ lh .run_or_die (cmd )
331
+ cmd = f"podman run -d --name { name } { image } sh -c 'while true; do sleep 1; done'"
332
+ lh .run_or_die (cmd )
333
+
334
+ CONTAINER_NAME = "local-container-registry"
335
+ #cmd = f"podman run -d --name {CONTAINER_NAME} -p 5000:5000 -v /root/.local-container-registry/data:/var/lib/registry:z -v /root/.local-container-registry/auth:/auth:z -v /root/.local-container-registry/certs:/certs:z -e REGISTRY_HTTP_TLS_CERTIFICATE=/certs/domain.crt -e REGISTRY_HTTP_TLS_KEY=/certs/domain.key -e REGISTRY_COMPATIBILITY_SCHEMA1_ENABLED=true --annotation=LOCAL_CONTAINER_REGISTRY_HOSTNAME=wsfd-advnetlab217.anl.eng.bos2.dc.redhat.com docker.io/library/registry:latest"
336
+ cmd = f"podman run -d --name { CONTAINER_NAME } docker.io/library/registry:latest"
337
+ lh .run_or_die (cmd )
338
+
339
+ # h = host.Host(cc.workers[0].node)
340
+ # vendor_plugin = init_vendor_plugin(h)
341
+ # vendor_plugin.build_and_start(lh, client, registry)
342
+
343
+ # start_dpu_operator(lh, client, operator_image, daemon_image)
344
+ # client.oc_run_or_die("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=2m")
345
+
346
+ # def helper(h: host.Host, node: NodeConfig) -> Optional[host.Result]:
347
+ # # Temporary workaround, remove once 4.16 installations are working
348
+ # logger.info("Ensuring Rhel 9.4 kernel is installed")
349
+ # ensure_rhel_9_4_kernel_is_installed(h)
350
+ # # There is a bug with the idpf driver that causes the IPU to fail to be enumerated over PCIe on boot
351
+ # # As a result, we will need to trigger cold boots of the node until the device is available
352
+ # # TODO: Remove when no longer needed
353
+ # retries = 3
354
+ # h.ssh_connect("core")
355
+ # ret = h.run(f"test -d /sys/class/net/{cfg.dpu_net_interface}")
356
+ # while ret.returncode != 0:
357
+ # logger.error(f"{h.hostname()} does not have a network device {cfg.dpu_net_interface} cold booting node to try to recover")
358
+ # h.cold_boot()
359
+ # logger.info("Cold boot triggered, waiting for host to reboot")
360
+ # time.sleep(60)
361
+ # h.ssh_connect("core")
362
+ # retries = retries - 1
363
+ # if retries == 0:
364
+ # logger.error_and_exit(f"Failed to bring up IPU net device on {h.hostname()}")
365
+ # ret = h.run(f"test -d /sys/class/net/{cfg.dpu_net_interface}")
366
+
367
+ # # Label the node
368
+ # logger.info(f"labeling node {h.hostname()} dpu=true")
369
+ # client.oc_run_or_die(f"label no {e.name} dpu=true")
370
+ # return None
371
+
372
+ # executor = ThreadPoolExecutor(max_workers=len(cc.workers))
373
+ # f = []
374
+ # # Assuming that all workers have a DPU
375
+ # for e in cc.workers:
376
+ # logger.info(f"Calling helper function for node {e.node}")
377
+ # bmc = host.BMC.from_bmc(e.bmc, e.bmc_user, e.bmc_password)
378
+ # h = host.Host(e.node, bmc)
379
+ # f.append(executor.submit(helper, h, e))
380
+
381
+ # for thread in f:
382
+ # logger.info(thread.result())
383
+
384
+ # logger.info("Verified idpf is providing net-devs on DPU worker nodes")
385
+
386
+ # # Create host nad
387
+ # # TODO: Remove when this is automatically created by the dpu operator
388
+ # logger.info("Creating dpu NAD")
389
+ # client.oc("delete -f manifests/dpu/dpu_nad.yaml")
390
+ # client.oc_run_or_die("create -f manifests/dpu/dpu_nad.yaml")
391
+ # # Deploy dpu daemon and wait for dpu pods to come up
392
+ # logger.info("Creating dpu operator config")
393
+ # client.oc_run_or_die(f"create -f {REPO_DIR}/examples/dpu.yaml")
394
+ # time.sleep(30)
395
+ # client.oc_run_or_die("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=5m")
382
396
logger .info ("Finished setting up dpu operator on host" )
383
397
384
398
0 commit comments