Skip to content

Commit eb7945f

Browse files
authored
Warmboot script improvements - timeout exec, disable swss autorestart, remove trap (sonic-net#1495)
The following changes are made to the warmboot/fastboot script: 1. Add a 5-second timeout to make sure the syncd shutdown request returns in time. 2. Disable the trap handlers after `set +e`. 3. Make sure that the syncd pre-shutdown wait won't take more than 60 seconds. 4. Make sure subsequent docker exec calls won't get stuck for a long time. 5. Before shutdown, check that docker exec still works on the relevant docker containers.
1 parent c7d4947 commit eb7945f

File tree

1 file changed

+36
-16
lines changed

1 file changed

+36
-16
lines changed

scripts/fast-reboot

+36-16
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ function error()
4747
function debug()
4848
{
4949
if [[ x"${VERBOSE}" == x"yes" ]]; then
50-
echo `date` $@
50+
echo $(date) $@
5151
fi
5252
logger "$@"
5353
}
@@ -128,10 +128,10 @@ function clear_warm_boot()
128128
{
129129
common_clear
130130

131-
result=`timeout 10s config warm_restart disable; if [[ $? == 124 ]]; then echo timeout; else echo "code ($?)"; fi` || /bin/true
131+
result=$(timeout 10s config warm_restart disable; res=$?; if [[ $res == 124 ]]; then echo timeout; else echo "code ($res)"; fi) || /bin/true
132132
debug "Cancel warm-reboot: ${result}"
133133
134-
TIMESTAMP=`date +%Y%m%d-%H%M%S`
134+
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
135135
if [[ -f ${WARM_DIR}/${REDIS_FILE} ]]; then
136136
mv -f ${WARM_DIR}/${REDIS_FILE} ${WARM_DIR}/${REDIS_FILE}.${TIMESTAMP} || /bin/true
137137
fi
@@ -155,7 +155,7 @@ function initialize_pre_shutdown()
155155
{
156156
debug "Initialize pre-shutdown ..."
157157
TABLE="WARM_RESTART_TABLE|warm-shutdown"
158-
RESTORE_COUNT=`sonic-db-cli STATE_DB hget "${TABLE}" restore_count`
158+
RESTORE_COUNT=$(sonic-db-cli STATE_DB hget "${TABLE}" restore_count)
159159
if [[ -z "$RESTORE_COUNT" ]]; then
160160
sonic-db-cli STATE_DB hset "${TABLE}" "restore_count" "0" > /dev/null
161161
fi
@@ -165,9 +165,10 @@ function initialize_pre_shutdown()
165165
function request_pre_shutdown()
166166
{
167167
debug "Requesting pre-shutdown ..."
168-
/usr/bin/docker exec -i syncd /usr/bin/syncd_request_shutdown --pre &> /dev/null || {
168+
STATE=$(timeout 5s docker exec syncd /usr/bin/syncd_request_shutdown --pre &> /dev/null; if [[ $? == 124 ]]; then echo "timed out"; fi)
169+
if [[ x"${STATE}" == x"timed out" ]]; then
169170
error "Failed to request pre-shutdown"
170-
}
171+
fi
171172
}
172173
173174
function recover_issu_bank_file()
@@ -205,33 +206,33 @@ function wait_for_pre_shutdown_complete_or_fail()
205206
STATE="requesting"
206207
declare -i waitcount
207208
declare -i retrycount
208-
waitcount=0
209209
retrycount=0
210+
start_time=$SECONDS
211+
elapsed_time=$(($SECONDS - $start_time))
210212
# Wait up to 60 seconds for pre-shutdown to complete
211-
while [[ ${waitcount} -lt 600 ]]; do
213+
while [[ ${elapsed_time} -lt 60 ]]; do
212214
# timeout doesn't work with -i option of "docker exec". Therefore we have
213215
# to invoke docker exec directly below.
214-
STATE=`timeout 5s sonic-db-cli STATE_DB hget "${TABLE}" state; if [[ $? == 124 ]]; then echo "timed out"; fi`
216+
STATE=$(timeout 5s sonic-db-cli STATE_DB hget "${TABLE}" state; if [[ $? == 124 ]]; then echo "timed out"; fi)
215217
216218
if [[ x"${STATE}" == x"timed out" ]]; then
217-
waitcount+=50
218219
retrycount+=1
219-
debug "Timed out getting pre-shutdown state (${waitcount}) retry count ${retrycount} ..."
220+
debug "Timed out getting pre-shutdown state, retry count ${retrycount} ..."
220221
if [[ retrycount -gt 2 ]]; then
221222
break
222223
fi
223224
elif [[ x"${STATE}" != x"requesting" ]]; then
224225
break
225226
else
226227
sleep 0.1
227-
waitcount+=1
228228
fi
229+
elapsed_time=$(($SECONDS - $start_time))
229230
done
230231
231232
if [[ x"${STATE}" != x"pre-shutdown-succeeded" ]]; then
232-
debug "Syncd pre-shutdown failed: ${STATE} ..."
233+
debug "Syncd pre-shutdown failed, state: ${STATE} ..."
233234
else
234-
debug "Pre-shutdown succeeded ..."
235+
debug "Pre-shutdown succeeded, state: ${STATE} ..."
235236
fi
236237
}
237238
@@ -259,7 +260,10 @@ function backup_database()
259260
260261
# Dump redis content to a file 'dump.rdb' in warmboot directory
261262
docker cp database:/var/lib/$target_db_inst/$REDIS_FILE $WARM_DIR
262-
docker exec -i database rm /var/lib/$target_db_inst/$REDIS_FILE
263+
STATE=$(timeout 5s docker exec database rm /var/lib/$target_db_inst/$REDIS_FILE; if [[ $? == 124 ]]; then echo "timed out"; fi)
264+
if [[ x"${STATE}" == x"timed out" ]]; then
265+
error "Timed out during attempting to remove Redis dump file from database container"
266+
fi
263267
}
264268
265269
function setup_control_plane_assistant()
@@ -309,10 +313,23 @@ function setup_reboot_variables()
309313
INITRD=$(echo $KERNEL_IMAGE | sed 's/vmlinuz/initrd.img/g')
310314
}
311315
316+
function check_docker_exec()
317+
{
318+
containers="radv bgp lldp swss database teamd syncd"
319+
for container in $containers; do
320+
STATE=$(timeout 1s docker exec $container echo "success"; if [[ $? == 124 ]]; then echo "timed out"; fi)
321+
if [[ x"${STATE}" == x"timed out" ]]; then
322+
error "Docker exec on $container timedout"
323+
exit "${EXIT_FAILURE}"
324+
fi
325+
done
326+
}
327+
312328
function reboot_pre_check()
313329
{
330+
check_docker_exec
314331
# Make sure that the file system is normal: read-write able
315-
filename="/host/test-`date +%Y%m%d-%H%M%S`"
332+
filename="/host/test-$(date +%Y%m%d-%H%M%S)"
316333
if [[ ! -f ${filename} ]]; then
317334
touch ${filename}
318335
fi
@@ -541,6 +558,9 @@ fi
541558
# service will go down and we cannot recover from it.
542559
set +e
543560
561+
# disable trap-handlers which were set before
562+
trap '' EXIT HUP INT QUIT TERM KILL ABRT ALRM
563+
544564
if [ -x ${LOG_SSD_HEALTH} ]; then
545565
debug "Collecting logs to check ssd health before ${REBOOT_TYPE}..."
546566
${LOG_SSD_HEALTH}

0 commit comments

Comments
 (0)