Skip to content

Commit c09440d

Browse files
committed
extraConfigDpu: start p4 pod w/ retry
Signed-off-by: Salvatore Daniele <[email protected]>
1 parent b6a9531 commit c09440d

File tree

1 file changed

+21
-8
lines changed

1 file changed

+21
-8
lines changed

extraConfigDpu.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
DPU_OPERATOR_REPO = "https://github.com/openshift/dpu-operator.git"
1717
MICROSHIFT_KUBECONFIG = "/root/kubeconfig.microshift"
1818
OSE_DOCKERFILE = "https://pkgs.devel.redhat.com/cgit/containers/dpu-operator/tree/Dockerfile?h=rhaos-4.17-rhel-9"
19+
P4_IMG = "wsfd-advnetlab239.anl.eng.bos2.dc.redhat.com:5000/intel-ipu-p4-sdk:10-9-2024"
1920

2021
KERNEL_RPMS = [
2122
"https://download-01.beak-001.prod.iad2.dc.redhat.com/brewroot/vol/rhel-9/packages/kernel/5.14.0/427.2.1.el9_4/x86_64/kernel-5.14.0-427.2.1.el9_4.x86_64.rpm",
@@ -130,6 +131,7 @@ def dpu_operator_start(client: K8sClient, repo: Optional[str]) -> None:
130131
logger.info("Waiting for all dpu operator pods to become ready")
131132
time.sleep(30)
132133
client.oc_run_or_die("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=5m")
134+
wait_vsp_ds_running(client)
133135

134136

135137
def wait_vsp_ds_running(client: K8sClient) -> None:
@@ -148,6 +150,23 @@ def wait_vsp_ds_running(client: K8sClient) -> None:
148150
logger.error_and_exit("Vsp pods failed to reach ready state")
149151

150152

153+
def ensure_p4_pod_running(lh: host.LocalHost, acc: host.RemoteHost, imgReg: ImageRegistry, max_attempts: int = 30) -> None:
    """Mirror the P4 SDK image into ``imgReg`` and start it on ``acc``, retrying on failure.

    The image referenced by ``P4_IMG`` is pulled on the local host, re-tagged
    for ``imgReg`` and pushed there. The container is then started detached on
    ``acc`` with podman. Because the container occasionally fails to come up,
    ``podman ps`` is checked 10 seconds after each start and the container is
    started again if it is not listed.

    Args:
        lh: Local host used to pull, tag and push the image.
        acc: Remote host on which the container is started.
        imgReg: Registry the image is mirrored into; its ``url()`` prefixes
            the image reference used by ``podman run``.
        max_attempts: Maximum number of times the container is started before
            giving up. With the 10 second wait per attempt the default allows
            roughly five minutes.

    If the container is still not running after ``max_attempts`` starts, the
    failure is reported through ``logger.error_and_exit`` instead of looping
    forever.
    """
    # Derive "name:tag" from P4_IMG so the mirrored image, the run command and
    # the liveness check can never drift apart from the upstream reference.
    img_name = P4_IMG.rsplit("/", 1)[-1]
    local_img = f"{imgReg.url()}/{img_name}"

    lh.run_or_die(f"podman pull --tls-verify=false {P4_IMG}")
    lh.run_or_die(f"podman tag {P4_IMG} {local_img}")
    lh.run_or_die(f"podman push {local_img}")

    # The running kernel's module directory is bind-mounted into the container.
    uname = acc.run("uname -r").out.strip()
    logger.info("Manually starting P4 container")
    cmd = f"podman run --network host -d --privileged --entrypoint='[\"/bin/sh\", \"-c\", \"sleep 5; sh /entrypoint.sh\"]' -v /lib/modules/{uname}:/lib/modules/{uname} -v data1:/opt/p4 {local_img}"

    # Occasionally the P4 pod fails to start
    for attempt in range(1, max_attempts + 1):
        acc.run_or_die(cmd)
        # The container is started detached, so give it time to either settle
        # or exit before checking whether it is still listed as running.
        time.sleep(10)
        if img_name in acc.run("podman ps").out:
            return
        if attempt < max_attempts:
            logger.info("Failed to start p4 container, retrying")

    logger.error_and_exit(f"P4 container failed to start after {max_attempts} attempts")
168+
169+
151170
def ExtraConfigDpu(cc: ClustersConfig, cfg: ExtraConfigArgs, futures: dict[str, Future[Optional[host.Result]]]) -> None:
152171
[f.result() for (_, f) in futures.items()]
153172
logger.info("Running post config step to start DPU operator on IPU")
@@ -171,14 +190,8 @@ def ExtraConfigDpu(cc: ClustersConfig, cfg: ExtraConfigArgs, futures: dict[str,
171190
if isinstance(vendor_plugin, IpuPlugin):
172191
# TODO: Remove when this container is properly started by the vsp
173192
# We need to manually start the p4 sdk container currently for the IPU plugin
174-
p4_img = "wsfd-advnetlab239.anl.eng.bos2.dc.redhat.com:5000/intel-ipu-p4-sdk:10-9-2024"
175-
lh.run_or_die(f"podman pull --tls-verify=false {p4_img}")
176-
lh.run_or_die(f"podman tag {p4_img} {imgReg.url()}/intel-ipu-p4-sdk:10-9-2024")
177-
lh.run_or_die(f"podman push {imgReg.url()}/intel-ipu-p4-sdk:10-9-2024")
178-
uname = acc.run("uname -r").out.strip()
179-
logger.info("Manually starting P4 container")
180-
cmd = f"podman run --network host -d --privileged --entrypoint='[\"/bin/sh\", \"-c\", \"sleep 5; sh /entrypoint.sh\"]' -v /lib/modules/{uname}:/lib/modules/{uname} -v data1:/opt/p4 {imgReg.url()}/intel-ipu-p4-sdk:10-9-2024"
181-
acc.run_or_die(cmd)
193+
ensure_p4_pod_running(lh, acc, imgReg)
194+
182195
# Build on the ACC since an aarch based server is needed for the build
183196
# (the Dockerfile needs to be fixed to allow layered multi-arch build
184197
# by removing the calls to pip)

0 commit comments

Comments
 (0)