Skip to content

Commit 1282817

Browse files
committed
isoCluster: Ensure IPU netdevs on host are up
Currently, we need to cold boot the IPU host anytime the IMC reboots to ensure the netdevs are available. Ensure the netdevs are available when isocluster deployment completes. Signed-off-by: Salvatore Daniele <[email protected]>
1 parent da2f9a2 commit 1282817

File tree

1 file changed

+30
-0
lines changed

1 file changed

+30
-0
lines changed

isoCluster.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,36 @@ def enable_acc_connectivity(node: NodeConfig) -> None:
105105
ipu_acc.run("nmcli con mod enp0s1f0 ipv4.route-metric 0")
106106
ipu_acc.run("ip route delete default via 192.168.0.1") # remove imc default route to avoid conflict
107107
logger.info(f"{node.name} connectivity established")
108+
ensure_ipu_netdevs_available(node)
109+
110+
111+
#TODO: Remove this workaround once rebooting the IMC no longer causes the netdevs on the IPU host to be removed
112+
def host_from_imc(imc: str) -> str:
    """Return the IPU host name embedded in an IMC address.

    The host name is everything that precedes the first occurrence of the
    ``-intel-ipu-imc`` marker in ``imc``.  When the marker is absent the
    input is returned unchanged.
    """
    # partition() yields the same prefix as split(marker)[0], without building
    # a list of every remaining piece.
    return imc.partition('-intel-ipu-imc')[0]
115+
116+
117+
#TODO: Remove this workaround once rebooting the IMC no longer causes the netdevs on the IPU host to be removed
118+
def ensure_ipu_netdevs_available(node: NodeConfig) -> None:
    """Make sure the IPU net device exists on the x86 host behind ``node``.

    The host name is inferred from ``node.bmc`` via ``host_from_imc``.  The
    host is then checked over SSH for ``/sys/class/net/ens2f0``.  While the
    device is missing the host is cold booted, up to three times, and the
    device is re-checked after every boot.  If it is still missing after the
    last boot, ``logger.error_and_exit`` is called.
    """
    # This is a hack, iso_cluster deployments in general should not need to know about the x86 host they are connected to.
    # However, since we need to cold boot the corresponding host, for the time being, infer this from the IMC address
    # rather than requiring the user to provide this information.
    netdev = "ens2f0"
    max_cold_boots = 3
    check_cmd = f"test -d /sys/class/net/{netdev}"

    ipu_host_name = host_from_imc(node.bmc)
    # NOTE(review): the BMC domain and credentials are hard-coded here;
    # consider sourcing them from configuration.
    ipu_host_bmc = host.BMC.from_bmc(ipu_host_name + "-drac.anl.eng.bos2.dc.redhat.com", "root", "calvin")
    ipu_host = host.Host(ipu_host_name, ipu_host_bmc)
    ipu_host.ssh_connect("core")

    for _ in range(max_cold_boots):
        if ipu_host.run(check_cmd).returncode == 0:
            return
        logger.error(f"{ipu_host.hostname()} does not have a network device {netdev} cold booting node to try to recover")
        ipu_host.cold_boot()
        logger.info("Cold boot triggered, waiting for host to reboot")
        time.sleep(60)
        ipu_host.ssh_connect("core")

    # Verify the outcome of the final cold boot before giving up; previously
    # the last boot was triggered but its result was never checked.
    if ipu_host.run(check_cmd).returncode != 0:
        logger.error_and_exit(f"Failed to bring up IPU net device on {ipu_host.hostname()}")
108138

109139

110140
def is_http_url(url: str) -> bool:

0 commit comments

Comments
 (0)