Skip to content

Commit f49428e

Browse files
committed
Add workaround for flaky acc first boot
Sometimes, after the first boot, things just don't work. If we don't run the connectivity script constantly, we risk missing the window in which it needs to run. If we do run the connectivity script constantly, we seem to be able to break the node policy rules, leaving no connectivity on any port no matter what we do. Instead, add a more moderate loop that runs ipu_port1_setup.sh, and simply reboot the IMC if things have taken too long. After the first reboot, the behavior of the ACC seems much more predictable.

Signed-off-by: Salvatore Daniele <[email protected]>
1 parent 47c2299 commit f49428e

File tree

1 file changed

+27
-6
lines changed

1 file changed

+27
-6
lines changed

ipu.py

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -51,15 +51,35 @@ def __init__(self, config: NodeConfig, external_port: str, network_api_port: str
5151
self.network_api_port = network_api_port
5252
self.config = config
5353

54+
def _wait_for_acc_with_retry(self, acc: host.Host) -> None:
    """Block until the ACC answers a ping, then open an SSH session to it.

    The ACC is pinged every 20 seconds against a countdown that starts at
    1200 seconds. Each time the countdown runs out, the IMC (reached at
    ``self.config.bmc`` with the configured BMC credentials) is rebooted
    and the countdown is reset to 240 seconds. There is no upper bound on
    the number of reboots: the loop only ends once a ping succeeds.

    After a successful ping, connects to the ACC over SSH as ``root`` and
    logs the output of ``uname -a``.

    Args:
        acc: Host handle for the ACC to wait on.
    """
    poll_interval = 20

    # Per the original author: an ACC that booted properly typically comes
    # up in < 20 minutes (including the 10 min sleep done during boot).
    logger.info("Waiting for ACC to finish installing")
    remaining = 1200
    while not acc.ping():
        time.sleep(poll_interval)
        remaining -= poll_interval
        if remaining > 0:
            continue

        # Countdown exhausted: reboot the IMC and grant a shorter window
        # before the next reboot attempt.
        logger.info("ACC has not responded in a reasonable amount of time, rebooting IMC")
        imc = host.RemoteHost(self.config.bmc)
        imc.ssh_connect(self.config.bmc_user, self.config.bmc_password)
        imc.run("reboot")
        remaining = 240
    logger.info("ACC responded to ping, connecting")

    acc.ssh_connect("root", "redhat")
    logger.info(acc.run("uname -a"))
73+
5474
def _boot_iso(self, iso: str) -> None:
# Boot the IPU from `iso` and wait for the ACC to become reachable.
# In order: configure the ISO network port and dhcpd for this node,
# trigger the boot through _redfish_boot_ipu, then wait on the ACC
# at self.config.ip.
5575
assert self.config.ip
5676
dhcpConfig.configure_iso_network_port(self.network_api_port, self.config.ip)
5777
dhcpConfig.configure_dhcpd(self.config)
5878
self._redfish_boot_ipu(self.external_port, self.config, iso)
5979
# Wait for the install and the subsequent reboot to complete.
6080
acc = host.RemoteHost(self.config.ip)
61-
acc.ssh_connect("root", "redhat")
62-
logger.info(acc.run("uname -a"))
81+
# Workaround: we can't reliably expect the ACC to get a DHCP lease, see https://issues.redhat.com/browse/IIC-427
82+
self._wait_for_acc_with_retry(acc)
6383
# configure_iso_network_port(self.network_api_port, self.config.ip)
6484

6585
def start(self, iso_or_image_path: str, executor: ThreadPoolExecutor) -> None:
@@ -149,11 +169,12 @@ def _prepare_imc(self, server_with_key: str) -> None:
149169
nohup sh -c '
150170
while true; do
151171
if [ -f /work/scripts/ipu_port1_setup.sh ]; then
152-
ping -c 1 -W 2 192.168.0.2
153-
if [ $? -eq 0 ]; then
172+
while [ $count -lt 20 ]; do
154173
/work/scripts/ipu_port1_setup.sh
155-
sleep 1
156-
fi
174+
count=$((count + 1))
175+
sleep $count
176+
done
177+
break
157178
else
158179
break
159180
fi

0 commit comments

Comments
 (0)