
Commit e0c854e

extraConfigDpu: remove idpf netdev restart
In theory we should no longer need to manually cold boot the servers to ensure idpf netdevs are available. Currently there is a bug in MeV 1.8 that causes the host to reboot any time the IMC reboots, which masks this issue. If that masking is removed, we want to see a failure here to signal that lifecycle management of the IPU is broken.

Signed-off-by: Salvatore Daniele <[email protected]>
1 parent commit: a85c65d
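To make the intent concrete, here is a minimal hypothetical sketch (not part of this commit) of how a missing idpf netdev would now be expected to surface: as an immediate, explicit error rather than a cold-boot retry. It reuses only helpers visible in the diff below (host.Host, ssh_connect, run, logger.error_and_exit); the import lines are assumptions about the repository layout.

import host                 # assumed repository module, as used in the diff below
from logger import logger   # assumed repository module, as used in the diff below

def assert_dpu_netdev_present(h: host.Host, interface: str) -> None:
    # Check for the idpf netdev exactly as the removed workaround did ...
    h.ssh_connect("core")
    ret = h.run(f"test -d /sys/class/net/{interface}")
    if ret.returncode != 0:
        # ... but fail loudly instead of cold booting the node, so a broken
        # IPU lifecycle is visible rather than silently papered over.
        logger.error_and_exit(f"{h.hostname()} does not have a network device {interface}")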

File tree

2 files changed: 0 additions, 70 deletions

extraConfigDpu.py

Lines changed: 0 additions & 20 deletions
@@ -251,26 +251,6 @@ def ExtraConfigDpuHost(cc: ClustersConfig, cfg: ExtraConfigArgs, futures: dict[s
     dpu_operator_start(client, repo)
 
     def helper(h: host.Host, node: NodeConfig) -> Optional[host.Result]:
-        # Temporary workaround, remove once 4.16 installations are working
-        logger.info("Ensuring Rhel 9.4 kernel is installed")
-        ensure_rhel_9_4_kernel_is_installed(h)
-        # There is a bug with the idpf driver that causes the IPU to fail to be enumerated over PCIe on boot
-        # As a result, we will need to trigger cold boots of the node until the device is available
-        # TODO: Remove when no longer needed
-        retries = 3
-        h.ssh_connect("core")
-        ret = h.run(f"test -d /sys/class/net/{cfg.dpu_net_interface}")
-        while ret.returncode != 0:
-            logger.error(f"{h.hostname()} does not have a network device {cfg.dpu_net_interface} cold booting node to try to recover")
-            h.cold_boot()
-            logger.info("Cold boot triggered, waiting for host to reboot")
-            time.sleep(60)
-            h.ssh_connect("core")
-            retries = retries - 1
-            if retries == 0:
-                logger.error_and_exit(f"Failed to bring up IPU net device on {h.hostname()}")
-            ret = h.run(f"test -d /sys/class/net/{cfg.dpu_net_interface}")
-
         # Label the node
         logger.info(f"labeling node {h.hostname()} dpu=true")
         client.oc_run_or_die(f"label no {e.name} dpu=true")
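The presence check the removed loop relied on is just a sysfs directory test. As a self-contained illustration (standard library only, not code from this repository), the same check run locally rather than over SSH:

from pathlib import Path

def netdev_present(interface: str) -> bool:
    # Equivalent to `test -d /sys/class/net/<interface>` from the removed loop,
    # executed on the local machine instead of a remote host.
    return Path("/sys/class/net", interface).is_dir()

# e.g. netdev_present("ens2f0") is True only once the idpf netdev is enumerated.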

ipu.py

Lines changed: 0 additions & 50 deletions
@@ -82,7 +82,6 @@ def _boot_iso(self, iso: str) -> None:
         acc.ssh_connect("root", "redhat")
         logger.info(acc.run("uname -a"))
         # configure_iso_network_port(self.network_api_port, self.config.ip)
-        self._ensure_ipu_netdevs_available()
 
     def start(self, iso_or_image_path: str, executor: ThreadPoolExecutor) -> None:
         self.future = executor.submit(self._boot_iso, iso_or_image_path)
@@ -116,27 +115,6 @@ def helper(node: NodeConfig, iso_address: str) -> str:
     def post_boot(self, desired_ip_range: tuple[str, str]) -> bool:
         return True
 
-    # TODO: Remove this workaround once rebooting the IMC no longer
-    # causes the netdevs on the IPU host to be removed
-    def _ensure_ipu_netdevs_available(self) -> None:
-        # This is a hack, iso_cluster deployments in general should not need to know about the x86 host they are connected to.
-        # However, since we need to cold boot the corresponding host, for the time being, infer this from the IMC address
-        # rather than requiring the user to provide this information.
-        ipu_host = self._ipu_host()
-        ipu_host.ssh_connect("core")
-        ret = ipu_host.run("test -d /sys/class/net/ens2f0")
-        retries = 3
-        while ret.returncode != 0:
-            logger.error(f"{ipu_host.hostname()} does not have a network device ens2f0 cold booting node to try to recover")
-            ipu_host.cold_boot()
-            logger.info("Cold boot triggered, waiting for host to reboot")
-            time.sleep(60)
-            ipu_host.ssh_connect("core")
-            retries = retries - 1
-            if retries == 0:
-                logger.error_and_exit(f"Failed to bring up IPU net device on {ipu_host.hostname()}")
-            ret = ipu_host.run("test -d /sys/class/net/ens2f0")
-
     def _ipu_host(self) -> host.Host:
         def host_from_imc(imc: str) -> str:
             ipu_host = imc.split('-intel-ipu-imc')[0]
@@ -239,38 +217,10 @@ def _enable_acc_connectivity(self) -> None:
         ipu_acc.run("nmcli con mod enp0s1f0 ipv4.route-metric 0")
         ipu_acc.run("ip route delete default via 192.168.0.1") # remove imc default route to avoid conflict
         logger.info(f"{node.name} connectivity established")
-        self.ensure_ipu_netdevs_available()
 
     def post_boot(self, desired_ip_range: tuple[str, str]) -> bool:
         return True
 
-    # TODO: Remove this workaround once rebooting the IMC no longer causes the netdevs on the IPU host to be removed
-    def ensure_ipu_netdevs_available(self) -> None:
-        def host_from_imc(imc: str) -> str:
-            ipu_host = imc.split('-intel-ipu-imc')[0]
-            return ipu_host
-
-        node = self.config
-        # This is a hack, iso_cluster deployments in general should not need to know about the x86 host they are connected to.
-        # However, since we need to cold boot the corresponding host, for the time being, infer this from the IMC address
-        # rather than requiring the user to provide this information.
-        ipu_host_name = host_from_imc(node.bmc)
-        ipu_host_bmc = BMC.from_bmc(ipu_host_name + "-drac.anl.eng.bos2.dc.redhat.com", "root", "calvin")
-        ipu_host = host.Host(ipu_host_name, ipu_host_bmc)
-        ipu_host.ssh_connect("core")
-        ret = ipu_host.run("test -d /sys/class/net/ens2f0")
-        retries = 3
-        while ret.returncode != 0:
-            logger.error(f"{ipu_host.hostname()} does not have a network device ens2f0 cold booting node to try to recover")
-            ipu_host.cold_boot()
-            logger.info("Cold boot triggered, waiting for host to reboot")
-            time.sleep(60)
-            ipu_host.ssh_connect("core")
-            retries = retries - 1
-            if retries == 0:
-                logger.error_and_exit(f"Failed to bring up IPU net device on {ipu_host.hostname()}")
-            ret = ipu_host.run("test -d /sys/class/net/ens2f0")
-
 
 class IPUBMC(BMC):
     def __init__(self, full_url: str, user: str = "root", password: str = "calvin"):
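The host_from_imc helper retained in _ipu_host infers the x86 host name by stripping the '-intel-ipu-imc' suffix from the IMC address, so callers do not have to supply the host explicitly. A quick illustration with a made-up name (real IMC addresses follow the same pattern):

imc = "example-lab-node-intel-ipu-imc"   # hypothetical IMC address
ipu_host = imc.split('-intel-ipu-imc')[0]
print(ipu_host)                          # -> "example-lab-node"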
