
Commit e0c854e

extraConfigDpu: remove idpf netdev restart
In theory we should no longer need to manually cold boot the servers to ensure idpf netdevs are available. Currently there is a bug in MeV 1.8 that causes the host to reboot any time the IMC reboots, which masks this issue. If that masking is removed, we want to see a failure here to signal that lifecycle management of the IPU is broken.

Signed-off-by: Salvatore Daniele <[email protected]>
1 parent commit: a85c65d
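To make the intent concrete, here is a minimal hypothetical sketch (not part of this commit) of how a missing idpf netdev would now be expected to surface: as an immediate, explicit error rather than a cold-boot retry. It reuses only helpers visible in the diff below (host.Host, ssh_connect, run, logger.error_and_exit); the import lines are assumptions about the repository layout.

import host                 # assumed repository module, as used in the diff below
from logger import logger   # assumed repository module, as used in the diff below

def assert_dpu_netdev_present(h: host.Host, interface: str) -> None:
    # Check for the idpf netdev exactly as the removed workaround did ...
    h.ssh_connect("core")
    ret = h.run(f"test -d /sys/class/net/{interface}")
    if ret.returncode != 0:
        # ... but fail loudly instead of cold booting the node, so a broken
        # IPU lifecycle is visible rather than silently papered over.
        logger.error_and_exit(f"{h.hostname()} does not have a network device {interface}")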

File tree

2 files changed: 0 additions, 70 deletions

extraConfigDpu.py

Lines changed: 0 additions & 20 deletions
@@ -251,26 +251,6 @@ def ExtraConfigDpuHost(cc: ClustersConfig, cfg: ExtraConfigArgs, futures: dict[s
     dpu_operator_start(client, repo)
 
     def helper(h: host.Host, node: NodeConfig) -> Optional[host.Result]:
-        # Temporary workaround, remove once 4.16 installations are working
-        logger.info("Ensuring Rhel 9.4 kernel is installed")
-        ensure_rhel_9_4_kernel_is_installed(h)
-        # There is a bug with the idpf driver that causes the IPU to fail to be enumerated over PCIe on boot
-        # As a result, we will need to trigger cold boots of the node until the device is available
-        # TODO: Remove when no longer needed
-        retries = 3
-        h.ssh_connect("core")
-        ret = h.run(f"test -d /sys/class/net/{cfg.dpu_net_interface}")
-        while ret.returncode != 0:
-            logger.error(f"{h.hostname()} does not have a network device {cfg.dpu_net_interface} cold booting node to try to recover")
-            h.cold_boot()
-            logger.info("Cold boot triggered, waiting for host to reboot")
-            time.sleep(60)
-            h.ssh_connect("core")
-            retries = retries - 1
-            if retries == 0:
-                logger.error_and_exit(f"Failed to bring up IPU net device on {h.hostname()}")
-            ret = h.run(f"test -d /sys/class/net/{cfg.dpu_net_interface}")
-
         # Label the node
         logger.info(f"labeling node {h.hostname()} dpu=true")
         client.oc_run_or_die(f"label no {e.name} dpu=true")
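The presence check the removed loop relied on is just a sysfs directory test. As a self-contained illustration (standard library only, not code from this repository), the same check run locally rather than over SSH:

from pathlib import Path

def netdev_present(interface: str) -> bool:
    # Equivalent to `test -d /sys/class/net/<interface>` from the removed loop,
    # executed on the local machine instead of a remote host.
    return Path("/sys/class/net", interface).is_dir()

# e.g. netdev_present("ens2f0") is True only once the idpf netdev is enumerated.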

ipu.py

Lines changed: 0 additions & 50 deletions
@@ -82,7 +82,6 @@ def _boot_iso(self, iso: str) -> None:
         acc.ssh_connect("root", "redhat")
         logger.info(acc.run("uname -a"))
         # configure_iso_network_port(self.network_api_port, self.config.ip)
-        self._ensure_ipu_netdevs_available()
 
     def start(self, iso_or_image_path: str, executor: ThreadPoolExecutor) -> None:
         self.future = executor.submit(self._boot_iso, iso_or_image_path)
@@ -116,27 +115,6 @@ def helper(node: NodeConfig, iso_address: str) -> str:
     def post_boot(self, desired_ip_range: tuple[str, str]) -> bool:
         return True
 
-    # TODO: Remove this workaround once rebooting the IMC no longer
-    # causes the netdevs on the IPU host to be removed
-    def _ensure_ipu_netdevs_available(self) -> None:
-        # This is a hack, iso_cluster deployments in general should not need to know about the x86 host they are connected to.
-        # However, since we need to cold boot the corresponding host, for the time being, infer this from the IMC address
-        # rather than requiring the user to provide this information.
-        ipu_host = self._ipu_host()
-        ipu_host.ssh_connect("core")
-        ret = ipu_host.run("test -d /sys/class/net/ens2f0")
-        retries = 3
-        while ret.returncode != 0:
-            logger.error(f"{ipu_host.hostname()} does not have a network device ens2f0 cold booting node to try to recover")
-            ipu_host.cold_boot()
-            logger.info("Cold boot triggered, waiting for host to reboot")
-            time.sleep(60)
-            ipu_host.ssh_connect("core")
-            retries = retries - 1
-            if retries == 0:
-                logger.error_and_exit(f"Failed to bring up IPU net device on {ipu_host.hostname()}")
-            ret = ipu_host.run("test -d /sys/class/net/ens2f0")
-
     def _ipu_host(self) -> host.Host:
         def host_from_imc(imc: str) -> str:
             ipu_host = imc.split('-intel-ipu-imc')[0]
@@ -239,38 +217,10 @@ def _enable_acc_connectivity(self) -> None:
         ipu_acc.run("nmcli con mod enp0s1f0 ipv4.route-metric 0")
         ipu_acc.run("ip route delete default via 192.168.0.1") # remove imc default route to avoid conflict
         logger.info(f"{node.name} connectivity established")
-        self.ensure_ipu_netdevs_available()
 
     def post_boot(self, desired_ip_range: tuple[str, str]) -> bool:
         return True
 
-    # TODO: Remove this workaround once rebooting the IMC no longer causes the netdevs on the IPU host to be removed
-    def ensure_ipu_netdevs_available(self) -> None:
-        def host_from_imc(imc: str) -> str:
-            ipu_host = imc.split('-intel-ipu-imc')[0]
-            return ipu_host
-
-        node = self.config
-        # This is a hack, iso_cluster deployments in general should not need to know about the x86 host they are connected to.
-        # However, since we need to cold boot the corresponding host, for the time being, infer this from the IMC address
-        # rather than requiring the user to provide this information.
-        ipu_host_name = host_from_imc(node.bmc)
-        ipu_host_bmc = BMC.from_bmc(ipu_host_name + "-drac.anl.eng.bos2.dc.redhat.com", "root", "calvin")
-        ipu_host = host.Host(ipu_host_name, ipu_host_bmc)
-        ipu_host.ssh_connect("core")
-        ret = ipu_host.run("test -d /sys/class/net/ens2f0")
-        retries = 3
-        while ret.returncode != 0:
-            logger.error(f"{ipu_host.hostname()} does not have a network device ens2f0 cold booting node to try to recover")
-            ipu_host.cold_boot()
-            logger.info("Cold boot triggered, waiting for host to reboot")
-            time.sleep(60)
-            ipu_host.ssh_connect("core")
-            retries = retries - 1
-            if retries == 0:
-                logger.error_and_exit(f"Failed to bring up IPU net device on {ipu_host.hostname()}")
-            ret = ipu_host.run("test -d /sys/class/net/ens2f0")
-
 
 class IPUBMC(BMC):
     def __init__(self, full_url: str, user: str = "root", password: str = "calvin"):
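The host_from_imc helper retained in _ipu_host infers the x86 host name by stripping the '-intel-ipu-imc' suffix from the IMC address, so callers do not have to supply the host explicitly. A quick illustration with a made-up name (real IMC addresses follow the same pattern):

imc = "example-lab-node-intel-ipu-imc"   # hypothetical IMC address
ipu_host = imc.split('-intel-ipu-imc')[0]
print(ipu_host)                          # -> "example-lab-node"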
