Skip to content

Commit 913df4e

Browse files
authored
[201911] Warmboot script improvements - timeout in exec and disable service-autorestart (#2149)
The following changes are made to the warmboot/fastboot script: 1. Before shutdown, check that docker exec still works on the relevant docker containers. 2. Disable the trap handlers after set +e. 3. Disable service auto-restart during warm reboot. In 201811, only the swss restart needed to be extended. In 201911, most services can auto-restart, hence they are disabled in the shutdown path. 4. Add a 5s timeout to make sure the syncd shutdown request returns in time. 5. Make sure that the syncd pre-shutdown wait won't take more than 60 seconds. 6. Make sure subsequent docker exec calls won't get stuck for a long time. Related 201811 PR: #1474
1 parent a12ce13 commit 913df4e

File tree

1 file changed

+56
-22
lines changed

1 file changed

+56
-22
lines changed

scripts/fast-reboot

+56-22
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ function error()
4343
function debug()
4444
{
4545
if [[ x"${VERBOSE}" == x"yes" ]]; then
46-
echo `date` $@
46+
echo $(date) $@
4747
fi
4848
logger "$@"
4949
}
@@ -116,10 +116,10 @@ function clear_warm_boot()
116116
{
117117
common_clear
118118

119-
result=`timeout 10s config warm_restart disable; if [[ $? == 124 ]]; then echo timeout; else echo "code ($?)"; fi` || /bin/true
119+
result=$(timeout 10s config warm_restart disable; res=$?; if [[ $res == 124 ]]; then echo timeout; else echo "code ($res)"; fi) || /bin/true
120120
debug "Cancel warm-reboot: ${result}"
121121
122-
TIMESTAMP=`date +%Y%m%d-%H%M%S`
122+
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
123123
if [[ -f ${WARM_DIR}/${REDIS_FILE} ]]; then
124124
mv -f ${WARM_DIR}/${REDIS_FILE} ${WARM_DIR}/${REDIS_FILE}.${TIMESTAMP} || /bin/true
125125
fi
@@ -143,7 +143,7 @@ function initialize_pre_shutdown()
143143
{
144144
debug "Initialize pre-shutdown ..."
145145
TABLE="WARM_RESTART_TABLE|warm-shutdown"
146-
RESTORE_COUNT=`/usr/bin/redis-cli -n 6 hget "${TABLE}" restore_count`
146+
RESTORE_COUNT=$(/usr/bin/redis-cli -n 6 hget "${TABLE}" restore_count)
147147
if [[ -z "$RESTORE_COUNT" ]]; then
148148
/usr/bin/redis-cli -n 6 hset "${TABLE}" "restore_count" "0" > /dev/null
149149
fi
@@ -153,9 +153,10 @@ function initialize_pre_shutdown()
153153
function request_pre_shutdown()
154154
{
155155
debug "Requesting pre-shutdown ..."
156-
/usr/bin/docker exec -i syncd /usr/bin/syncd_request_shutdown --pre &> /dev/null || {
156+
STATE=$(timeout 5s docker exec syncd /usr/bin/syncd_request_shutdown --pre &> /dev/null; if [[ $? == 124 ]]; then echo "timed out"; fi)
157+
if [[ x"${STATE}" == x"timed out" ]]; then
157158
error "Failed to request pre-shutdown"
158-
}
159+
fi
159160
}
160161
161162
function recover_issu_bank_file_instruction()
@@ -201,33 +202,31 @@ function wait_for_pre_shutdown_complete_or_fail()
201202
STATE="requesting"
202203
declare -i waitcount
203204
declare -i retrycount
204-
waitcount=0
205205
retrycount=0
206+
start_time=$SECONDS
207+
elapsed_time=$(($SECONDS - $start_time))
206208
# Wait up to 60 seconds for pre-shutdown to complete
207-
while [[ ${waitcount} -lt 600 ]]; do
209+
while [[ ${elapsed_time} -lt 60 ]]; do
208210
# timeout doesn't work with -i option of "docker exec". Therefore we have
209211
# to invoke docker exec directly below.
210-
STATE=`timeout 5s docker exec database redis-cli -n 6 hget "${TABLE}" state; if [[ $? == 124 ]]; then echo "timed out"; fi`
211-
212+
STATE=$(timeout 5s docker exec database redis-cli -n 6 hget "${TABLE}" state; if [[ $? == 124 ]]; then echo "timed out"; fi)
212213
if [[ x"${STATE}" == x"timed out" ]]; then
213-
waitcount+=50
214214
retrycount+=1
215-
debug "Timed out getting pre-shutdown state (${waitcount}) retry count ${retrycount} ..."
215+
debug "Timed out getting pre-shutdown state, retry count ${retrycount} ..."
216216
if [[ retrycount -gt 2 ]]; then
217217
break
218218
fi
219219
elif [[ x"${STATE}" != x"requesting" ]]; then
220220
break
221221
else
222222
sleep 0.1
223-
waitcount+=1
224223
fi
224+
elapsed_time=$(($SECONDS - $start_time))
225225
done
226-
227226
if [[ x"${STATE}" != x"pre-shutdown-succeeded" ]]; then
228-
debug "Syncd pre-shutdown failed: ${STATE} ..."
227+
debug "Syncd pre-shutdown failed, state: ${STATE} ..."
229228
else
230-
debug "Pre-shutdown succeeded ..."
229+
debug "Pre-shutdown succeeded, state: ${STATE} ..."
231230
fi
232231
}
233232
@@ -248,7 +247,10 @@ function backup_database()
248247
" 0 > /dev/null
249248
redis-cli save > /dev/null
250249
docker cp database:/var/lib/redis/$REDIS_FILE $WARM_DIR
251-
docker exec -i database rm /var/lib/redis/$REDIS_FILE
250+
STATE=$(timeout 5s docker exec database rm /var/lib/redis/$REDIS_FILE; if [[ $? == 124 ]]; then echo "timed out"; fi)
251+
if [[ x"${STATE}" == x"timed out" ]]; then
252+
error "Timedout during attempting to remove redis dump file from database container"
253+
fi
252254
}
253255
254256
function setup_control_plane_assistant()
@@ -289,10 +291,23 @@ function setup_reboot_variables()
289291
INITRD=$(echo $KERNEL_IMAGE | sed 's/vmlinuz/initrd.img/g')
290292
}
291293
294+
function check_docker_exec()
295+
{
296+
containers="radv bgp lldp swss database teamd syncd"
297+
for container in $containers; do
298+
STATE=$(timeout 1s docker exec $container echo "success"; if [[ $? == 124 ]]; then echo "timed out"; fi)
299+
if [[ x"${STATE}" == x"timed out" ]]; then
300+
error "Docker exec on $container timedout"
301+
exit "${EXIT_FAILURE}"
302+
fi
303+
done
304+
}
305+
292306
function reboot_pre_check()
293307
{
308+
check_docker_exec
294309
# Make sure that the file system is normal: read-write able
295-
filename="/host/test-`date +%Y%m%d-%H%M%S`"
310+
filename="/host/test-$(date +%Y%m%d-%H%M%S)"
296311
if [[ ! -f ${filename} ]]; then
297312
touch ${filename}
298313
fi
@@ -456,10 +471,21 @@ if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; t
456471
fi
457472
fi
458473
459-
# We are fully committed to reboot from this point on becasue critical
474+
# We are fully committed to reboot from this point on because critical
460475
# service will go down and we cannot recover from it.
461476
set +e
462477
478+
# disable trap-handlers which were set before
479+
trap '' EXIT HUP INT QUIT TERM KILL ABRT ALRM
480+
481+
# "systemctl stop <service>" is expected to prevent service/containers from
482+
# restarting automatically. However, in some rare cases, systemctl stop doesn't
483+
# work as expected, and services can still auto-restart after RestartSec timer expires
484+
# Therefore, as a preventive measure, explicitly disable service auto-restart in the shutdown path.
485+
debug "Disabling auto-restart for services ..."
486+
grep -l "Restart=always" /usr/lib/systemd/system/*.service | xargs sed -i -e "s/\<Restart=always\>/Restart=no/"
487+
systemctl daemon-reload
488+
463489
# Kill nat docker after saving the conntrack table
464490
debug "Stopping nat ..."
465491
/usr/bin/dump_nat_entries.py
@@ -544,8 +570,12 @@ if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; t
544570
# Send USR1 signal to all teamd instances to stop them
545571
# It will prepare teamd for warm-reboot
546572
# Note: We must send USR1 signal before syncd, because it will send the last packet through CPU port
547-
docker exec -i teamd pkill -USR1 teamd > /dev/null || [ $? == 1 ]
548-
debug "Stopped teamd ..."
573+
STATE=$(timeout 5s docker exec teamd pkill -USR1 teamd; if [[ $? == 124 ]]; then echo "timed out"; fi)
574+
if [[ x"${STATE}" == x"timed out" ]]; then
575+
error "Timedout while attempting to stop teamd instances"
576+
else
577+
debug "Stopped teamd ..."
578+
fi
549579
fi
550580
551581
debug "Stopping syncd ..."
@@ -573,7 +603,7 @@ systemctl stop docker.service || debug "Ignore stopping docker service error $?"
573603
# Stop kernel modules for Nephos platform
574604
if [[ "$sonic_asic_type" = 'nephos' ]];
575605
then
576-
systemctl stop nps-modules-`uname -r`.service || debug "Ignore stopping nps service error $?"
606+
systemctl stop nps-modules-$(uname -r).service || debug "Ignore stopping nps service error $?"
577607
fi
578608
579609
if [[ "$REBOOT_TYPE" = "fast-reboot" ]]; then
@@ -617,6 +647,10 @@ if [ -x ${DEVPATH}/${PLATFORM}/${SSD_FW_UPDATE} ]; then
617647
${DEVPATH}/${PLATFORM}/${SSD_FW_UPDATE} ${REBOOT_TYPE}
618648
fi
619649
650+
# Restore the restart configuration for systemctl services
651+
debug "Reset service auto restart ..."
652+
grep -l "Restart=no" /usr/lib/systemd/system/*.service | xargs sed -i -e "s/\<Restart=no\>/Restart=always/"
653+
620654
# Reboot: explicitly call Linux native reboot under sbin
621655
debug "Rebooting with ${REBOOT_METHOD} to ${NEXT_SONIC_IMAGE} ..."
622656
exec ${REBOOT_METHOD}

0 commit comments

Comments
 (0)