@@ -82,7 +82,6 @@ def _boot_iso(self, iso: str) -> None:
82
82
acc .ssh_connect ("root" , "redhat" )
83
83
logger .info (acc .run ("uname -a" ))
84
84
# configure_iso_network_port(self.network_api_port, self.config.ip)
85
- self ._ensure_ipu_netdevs_available ()
86
85
87
86
def start (self , iso_or_image_path : str , executor : ThreadPoolExecutor ) -> None :
88
87
self .future = executor .submit (self ._boot_iso , iso_or_image_path )
@@ -116,27 +115,6 @@ def helper(node: NodeConfig, iso_address: str) -> str:
116
115
def post_boot (self , desired_ip_range : tuple [str , str ]) -> bool :
117
116
return True
118
117
119
- # TODO: Remove this workaround once rebooting the IMC no longer
120
- # causes the netdevs on the IPU host to be removed
121
- def _ensure_ipu_netdevs_available (self ) -> None :
122
- # This is a hack, iso_cluster deployments in general should not need to know about the x86 host they are connected to.
123
- # However, since we need to cold boot the corresponding host, for the time being, infer this from the IMC address
124
- # rather than requiring the user to provide this information.
125
- ipu_host = self ._ipu_host ()
126
- ipu_host .ssh_connect ("core" )
127
- ret = ipu_host .run ("test -d /sys/class/net/ens2f0" )
128
- retries = 3
129
- while ret .returncode != 0 :
130
- logger .error (f"{ ipu_host .hostname ()} does not have a network device ens2f0 cold booting node to try to recover" )
131
- ipu_host .cold_boot ()
132
- logger .info ("Cold boot triggered, waiting for host to reboot" )
133
- time .sleep (60 )
134
- ipu_host .ssh_connect ("core" )
135
- retries = retries - 1
136
- if retries == 0 :
137
- logger .error_and_exit (f"Failed to bring up IPU net device on { ipu_host .hostname ()} " )
138
- ret = ipu_host .run ("test -d /sys/class/net/ens2f0" )
139
-
140
118
def _ipu_host (self ) -> host .Host :
141
119
def host_from_imc (imc : str ) -> str :
142
120
ipu_host = imc .split ('-intel-ipu-imc' )[0 ]
@@ -239,38 +217,10 @@ def _enable_acc_connectivity(self) -> None:
239
217
ipu_acc .run ("nmcli con mod enp0s1f0 ipv4.route-metric 0" )
240
218
ipu_acc .run ("ip route delete default via 192.168.0.1" ) # remove imc default route to avoid conflict
241
219
logger .info (f"{ node .name } connectivity established" )
242
- self .ensure_ipu_netdevs_available ()
243
220
244
221
def post_boot (self , desired_ip_range : tuple [str , str ]) -> bool :
245
222
return True
246
223
247
- # TODO: Remove this workaround once rebooting the IMC no longer causes the netdevs on the IPU host to be removed
248
- def ensure_ipu_netdevs_available (self ) -> None :
249
- def host_from_imc (imc : str ) -> str :
250
- ipu_host = imc .split ('-intel-ipu-imc' )[0 ]
251
- return ipu_host
252
-
253
- node = self .config
254
- # This is a hack, iso_cluster deployments in general should not need to know about the x86 host they are connected to.
255
- # However, since we need to cold boot the corresponding host, for the time being, infer this from the IMC address
256
- # rather than requiring the user to provide this information.
257
- ipu_host_name = host_from_imc (node .bmc )
258
- ipu_host_bmc = BMC .from_bmc (ipu_host_name + "-drac.anl.eng.bos2.dc.redhat.com" , "root" , "calvin" )
259
- ipu_host = host .Host (ipu_host_name , ipu_host_bmc )
260
- ipu_host .ssh_connect ("core" )
261
- ret = ipu_host .run ("test -d /sys/class/net/ens2f0" )
262
- retries = 3
263
- while ret .returncode != 0 :
264
- logger .error (f"{ ipu_host .hostname ()} does not have a network device ens2f0 cold booting node to try to recover" )
265
- ipu_host .cold_boot ()
266
- logger .info ("Cold boot triggered, waiting for host to reboot" )
267
- time .sleep (60 )
268
- ipu_host .ssh_connect ("core" )
269
- retries = retries - 1
270
- if retries == 0 :
271
- logger .error_and_exit (f"Failed to bring up IPU net device on { ipu_host .hostname ()} " )
272
- ret = ipu_host .run ("test -d /sys/class/net/ens2f0" )
273
-
274
224
275
225
class IPUBMC (BMC ):
276
226
def __init__ (self , full_url : str , user : str = "root" , password : str = "calvin" ):
0 commit comments