Skip to content

Commit 5bdfb45

Browse files
committed
Add workaround for flaky acc first boot
Sometimes after the first boot, things just don't work. If we don't run the connectivity script constantly, we risk missing the window in which it needs to run. If we do run the connectivity script constantly, we seem to be able to break the node policy rules, such that there is no connectivity on any port no matter what we do. Instead, add a more moderate script to run ipu_port1_setup.sh, and just reboot the IMC if things have taken too long. After the first reboot, the behavior of the ACC seems much more predictable. Signed-off-by: Salvatore Daniele <[email protected]>
1 parent 47c2299 commit 5bdfb45

File tree

1 file changed

+26
-6
lines changed

1 file changed

+26
-6
lines changed

ipu.py

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,25 @@ def __init__(self, config: NodeConfig, external_port: str, network_api_port: str
5050
self.external_port = external_port
5151
self.network_api_port = network_api_port
5252
self.config = config
53+
54+
def _wait_for_acc_with_retry(self, acc: host.Host):
    """Block until the ACC answers a ping, then open an SSH session to it.

    The ACC is polled with ``acc.ping()``. After every failed ping the
    method sleeps 20 seconds and deducts that from a wait budget, which
    starts at 1500 seconds. Each time the budget is used up, the IMC at
    ``self.config.bmc`` is rebooted over SSH (using
    ``self.config.bmc_user`` / ``self.config.bmc_password``) and the
    budget is re-armed to 240 seconds. There is no upper bound on the
    number of reboot cycles; the loop only ends once a ping succeeds.

    Once the ACC is reachable, connect to it as ``root`` and log the
    output of ``uname -a``.

    Args:
        acc: Host handle for the ACC to wait for and connect to.
    """
    # Typically if the acc booted properly it will take < 20 minutes to
    # come up (including the 10 min sleep we do during boot).
    poll_interval = 20
    remaining = 1500

    while not acc.ping():
        time.sleep(poll_interval)
        remaining -= poll_interval
        if remaining > 0:
            # Still inside the current wait budget: just poll again.
            continue

        logger.info("ACC has not responded in a reasonable amount of time, rebooting IMC")
        imc = host.RemoteHost(self.config.bmc)
        imc.ssh_connect(self.config.bmc_user, self.config.bmc_password)
        imc.run("reboot")
        # Shorter budget for every wait that follows an IMC reboot.
        remaining = 240

    logger.info("ACC responded to ping, connecting")

    acc.ssh_connect("root", "redhat")
    uname_output = acc.run("uname -a")
    logger.info(uname_output)
5372

5473
def _boot_iso(self, iso: str) -> None:
5574
assert self.config.ip
@@ -58,8 +77,8 @@ def _boot_iso(self, iso: str) -> None:
5877
self._redfish_boot_ipu(self.external_port, self.config, iso)
5978
# wait on install + reboot to complete
6079
acc = host.RemoteHost(self.config.ip)
61-
acc.ssh_connect("root", "redhat")
62-
logger.info(acc.run("uname -a"))
80+
# WA since we can't reliably expect the acc to get a dhcp lease due to https://issues.redhat.com/browse/IIC-427
81+
self._wait_for_acc_with_retry(acc)
6382
# configure_iso_network_port(self.network_api_port, self.config.ip)
6483

6584
def start(self, iso_or_image_path: str, executor: ThreadPoolExecutor) -> None:
@@ -149,11 +168,12 @@ def _prepare_imc(self, server_with_key: str) -> None:
149168
nohup sh -c '
150169
while true; do
151170
if [ -f /work/scripts/ipu_port1_setup.sh ]; then
152-
ping -c 1 -W 2 192.168.0.2
153-
if [ $? -eq 0 ]; then
171+
while [ $count -lt 20 ]; do
154172
/work/scripts/ipu_port1_setup.sh
155-
sleep 1
156-
fi
173+
count=$((count + 1))
174+
sleep $count
175+
done
176+
break
157177
else
158178
break
159179
fi

0 commit comments

Comments
 (0)