@@ -105,6 +105,36 @@ def enable_acc_connectivity(node: NodeConfig) -> None:
105
105
ipu_acc .run ("nmcli con mod enp0s1f0 ipv4.route-metric 0" )
106
106
ipu_acc .run ("ip route delete default via 192.168.0.1" ) # remove imc default route to avoid conflict
107
107
logger .info (f"{ node .name } connectivity established" )
108
+ ensure_ipu_netdevs_available (node )
109
+
110
+
111
+ # TODO: Remove this workaround once rebooting the IMC no longer causes the netdevs on the IPU host to be removed
112
def host_from_imc(imc: str) -> str:
    """Derive the IPU host name from an IMC name.

    Returns everything in ``imc`` that precedes the first occurrence of
    ``-intel-ipu-imc``. If that marker is absent, ``imc`` is returned
    unchanged.
    """
    prefix, _marker, _rest = imc.partition('-intel-ipu-imc')
    return prefix
115
+
116
+
117
+ # TODO: Remove this workaround once rebooting the IMC no longer causes the netdevs on the IPU host to be removed
118
def ensure_ipu_netdevs_available(node: NodeConfig) -> None:
    """Make sure the x86 host attached to the IPU exposes the ens2f0 netdev.

    The host name is inferred from ``node.bmc`` via ``host_from_imc``. The
    host is reached over SSH as user "core" and checked for
    /sys/class/net/ens2f0. While the device is missing, the host is cold
    booted through its BMC, given 60 seconds to come back, and checked
    again. After three cold boots that each fail to restore the device,
    ``logger.error_and_exit`` is called.

    Args:
        node: Node whose ``bmc`` attribute names the IMC of the IPU.
    """
    # This is a hack, iso_cluster deployments in general should not need to know about the x86 host they are connected to.
    # However, since we need to cold boot the corresponding host, for the time being, infer this from the IMC address
    # rather than requiring the user to provide this information.
    ipu_host_name = host_from_imc(node.bmc)
    # NOTE(review): the BMC domain suffix and credentials are hard-coded here;
    # consider sourcing them from configuration.
    ipu_host_bmc = host.BMC.from_bmc(ipu_host_name + "-drac.anl.eng.bos2.dc.redhat.com", "root", "calvin")
    ipu_host = host.Host(ipu_host_name, ipu_host_bmc)
    ipu_host.ssh_connect("core")

    netdev_check = "test -d /sys/class/net/ens2f0"
    retries = 3
    ret = ipu_host.run(netdev_check)
    while ret.returncode != 0:
        # Give up only once every allowed cold boot has been tried AND checked;
        # testing the budget before booting guarantees the result of the last
        # cold boot is inspected instead of being discarded.
        if retries == 0:
            # presumably error_and_exit terminates the process — verify against the logger implementation
            logger.error_and_exit(f"Failed to bring up IPU net device on {ipu_host.hostname()}")
        logger.error(f"{ipu_host.hostname()} does not have a network device ens2f0 cold booting node to try to recover")
        ipu_host.cold_boot()
        logger.info("Cold boot triggered, waiting for host to reboot")
        time.sleep(60)
        # The previous SSH session does not survive the reboot, so reconnect.
        ipu_host.ssh_connect("core")
        retries -= 1
        ret = ipu_host.run(netdev_check)
108
138
109
139
110
140
def is_http_url (url : str ) -> bool :
0 commit comments