16
16
DPU_OPERATOR_REPO = "https://github.com/openshift/dpu-operator.git"
17
17
MICROSHIFT_KUBECONFIG = "/root/kubeconfig.microshift"
18
18
OSE_DOCKERFILE = "https://pkgs.devel.redhat.com/cgit/containers/dpu-operator/tree/Dockerfile?h=rhaos-4.17-rhel-9"
19
+ P4_IMG = "wsfd-advnetlab239.anl.eng.bos2.dc.redhat.com:5000/intel-ipu-p4-sdk:10-9-2024"
19
20
20
21
KERNEL_RPMS = [
21
22
"https://download-01.beak-001.prod.iad2.dc.redhat.com/brewroot/vol/rhel-9/packages/kernel/5.14.0/427.2.1.el9_4/x86_64/kernel-5.14.0-427.2.1.el9_4.x86_64.rpm" ,
@@ -130,6 +131,7 @@ def dpu_operator_start(client: K8sClient, repo: Optional[str]) -> None:
130
131
logger .info ("Waiting for all dpu operator pods to become ready" )
131
132
time .sleep (30 )
132
133
client .oc_run_or_die ("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=5m" )
134
+ wait_vsp_ds_running (client )
133
135
134
136
135
137
def wait_vsp_ds_running (client : K8sClient ) -> None :
@@ -148,6 +150,23 @@ def wait_vsp_ds_running(client: K8sClient) -> None:
148
150
logger .error_and_exit ("Vsp pods failed to reach ready state" )
149
151
150
152
153
def ensure_p4_pod_running(lh: host.Host, acc: host.Host, imgReg: ImageRegistry, max_attempts: int = 30, poll_interval: int = 10) -> None:
    """Mirror the P4 SDK image into the local registry and start it on the ACC.

    The image named by ``P4_IMG`` is pulled on ``lh`` (TLS verification
    disabled), re-tagged under ``imgReg.url()`` and pushed there. The
    container is then started on ``acc`` with ``podman run`` and ``podman ps``
    is polled until the image shows up among the running containers.

    Args:
        lh: Host used to pull, tag and push the image.
        acc: Host on which the container is started and checked.
        imgReg: Registry whose ``url()`` prefixes the mirrored image name.
        max_attempts: Upper bound on start attempts before giving up.
        poll_interval: Seconds to wait after each start attempt before
            checking ``podman ps``.

    If the container is still not listed after ``max_attempts`` attempts the
    failure is reported through ``logger.error_and_exit``.
    """
    # Derive "<name>:<tag>" from the constant so a bump of P4_IMG cannot
    # drift from the tag that is pushed, started and searched for.
    image_name = P4_IMG.rsplit("/", 1)[-1]
    local_img = f"{imgReg.url()}/{image_name}"

    lh.run_or_die(f"podman pull --tls-verify=false {P4_IMG}")
    lh.run_or_die(f"podman tag {P4_IMG} {local_img}")
    lh.run_or_die(f"podman push {local_img}")

    # The kernel release of the ACC selects which /lib/modules tree is
    # bind-mounted into the container.
    uname = acc.run("uname -r").out.strip()

    logger.info("Manually starting P4 container")
    # JSON-array entrypoint: no stray whitespace is allowed inside the
    # elements, otherwise podman looks for an executable named " /bin/sh".
    entrypoint = '["/bin/sh", "-c", "sleep 5; sh /entrypoint.sh"]'
    cmd = (
        f"podman run --network host -d --privileged --entrypoint='{entrypoint}' "
        f"-v /lib/modules/{uname}:/lib/modules/{uname} -v data1:/opt/p4 {local_img}"
    )

    # Occasionally the P4 pod fails to start, so retry -- but only a bounded
    # number of times so a persistent failure cannot hang the deployment.
    for attempt in range(1, max_attempts + 1):
        acc.run_or_die(cmd)
        time.sleep(poll_interval)
        if image_name in acc.run("podman ps").out:
            return
        if attempt < max_attempts:
            logger.info("Failed to start p4 container, retrying")

    logger.error_and_exit("P4 container failed to reach running state")
168
+
169
+
151
170
def ExtraConfigDpu (cc : ClustersConfig , cfg : ExtraConfigArgs , futures : dict [str , Future [Optional [host .Result ]]]) -> None :
152
171
[f .result () for (_ , f ) in futures .items ()]
153
172
logger .info ("Running post config step to start DPU operator on IPU" )
@@ -171,14 +190,8 @@ def ExtraConfigDpu(cc: ClustersConfig, cfg: ExtraConfigArgs, futures: dict[str,
171
190
if isinstance (vendor_plugin , IpuPlugin ):
172
191
# TODO: Remove when this container is properly started by the vsp
173
192
# We need to manually start the p4 sdk container currently for the IPU plugin
174
- p4_img = "wsfd-advnetlab239.anl.eng.bos2.dc.redhat.com:5000/intel-ipu-p4-sdk:10-9-2024"
175
- lh .run_or_die (f"podman pull --tls-verify=false { p4_img } " )
176
- lh .run_or_die (f"podman tag { p4_img } { imgReg .url ()} /intel-ipu-p4-sdk:10-9-2024" )
177
- lh .run_or_die (f"podman push { imgReg .url ()} /intel-ipu-p4-sdk:10-9-2024" )
178
- uname = acc .run ("uname -r" ).out .strip ()
179
- logger .info ("Manually starting P4 container" )
180
- cmd = f"podman run --network host -d --privileged --entrypoint='[\" /bin/sh\" , \" -c\" , \" sleep 5; sh /entrypoint.sh\" ]' -v /lib/modules/{ uname } :/lib/modules/{ uname } -v data1:/opt/p4 { imgReg .url ()} /intel-ipu-p4-sdk:10-9-2024"
181
- acc .run_or_die (cmd )
193
+ ensure_p4_pod_running (lh , acc , imgReg )
194
+
182
195
# Build on the ACC since an aarch based server is needed for the build
183
196
# (the Dockerfile needs to be fixed to allow layered multi-arch build
184
197
# by removing the calls to pip)
0 commit comments