Skip to content

Commit 3ce8952

Browse files
stepanblyschak authored and lguohan committed
[mellanox|ffb] use system level warm reboot for Mellanox fastfast boot (sonic-net#413)
* [mellanox|ffb] use system level warm reboot for Mellanox fastfast boot
  Signed-off-by: Stepan Blyschak <[email protected]>
* [mellanox|ffb] don't allocate tty for docker exec
  Signed-off-by: Stepan Blyschak <[email protected]>
* redirect stdout to /dev/null for redis commands and orch/syncd shutdown requests
  Signed-off-by: Stepan Blyschak <[email protected]>
* fail on pkill -USR1 teamd only when teamd process not found
  Signed-off-by: Stepan Blyschak <[email protected]>
* add error codes and mlnx specific error codes, add error() function
  Signed-off-by: Stepan Blyschak <[email protected]>
1 parent fee2a6b commit 3ce8952

File tree

2 files changed

+65
-104
lines changed

2 files changed

+65
-104
lines changed

scripts/fast-reboot

+64-103
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,24 @@ VERBOSE=no
1111
FORCE=no
1212
REBOOT_METHOD="/sbin/reboot"
1313

14+
EXIT_SUCCESS=0
15+
EXIT_FAILURE=1
16+
EXIT_NOT_SUPPORTED=2
17+
EXIT_ORCHAGENT_SHUTDOWN=10
18+
EXIT_SYNCD_SHUTDOWN=11
19+
1420
# Check root privileges
1521
if [[ "$EUID" -ne 0 ]]
1622
then
1723
echo "This command must be run as root" >&2
18-
exit 1
24+
exit "${EXIT_FAILURE}"
1925
fi
2026

27+
function error()
28+
{
29+
echo $@ >&2
30+
}
31+
2132
function debug()
2233
{
2334
if [[ x"${VERBOSE}" == x"yes" ]]; then
@@ -36,7 +47,7 @@ function showHelpAndExit()
3647
echo " -k : reboot with /sbin/kexec -e"
3748
echo " -x : execute script with -x flag"
3849

39-
exit 0
50+
exit "${EXIT_SUCCESS}"
4051
}
4152

4253
function parseOptions()
@@ -83,34 +94,24 @@ function clear_warm_boot()
8394
fi
8495
}
8596
86-
function cleanup_except_table()
87-
{
88-
local REDIS_DB_NUMBER="$1"
89-
local TABLE_PREFIX="$2"
90-
redis-cli -n "${REDIS_DB_NUMBER}" eval "
91-
for _, k in ipairs(redis.call('keys', '*')) do
92-
if not string.match(k, '${TABLE_PREFIX}') then
93-
redis.call('del', k)
94-
end
95-
end
96-
" 0
97-
}
98-
9997
function initialize_pre_shutdown()
10098
{
10199
debug "Initialize pre-shutdown ..."
102100
TABLE="WARM_RESTART_TABLE|warm-shutdown"
103101
RESTORE_COUNT=`/usr/bin/redis-cli -n 6 hget "${TABLE}" restore_count`
104102
if [[ -z "$RESTORE_COUNT" ]]; then
105-
/usr/bin/redis-cli -n 6 hset "${TABLE}" restore_count 0
103+
/usr/bin/redis-cli -n 6 hset "${TABLE}" "restore_count" "0" > /dev/null
106104
fi
107-
/usr/bin/redis-cli -n 6 hset "${TABLE}" state requesting
105+
/usr/bin/redis-cli -n 6 hset "${TABLE}" "state" "requesting" > /dev/null
108106
}
109107
110108
function request_pre_shutdown()
111109
{
112110
debug "Requesting pre-shutdown ..."
113-
/usr/bin/docker exec -i syncd /usr/bin/syncd_request_shutdown --pre
111+
/usr/bin/docker exec -i syncd /usr/bin/syncd_request_shutdown --pre &> /dev/null || {
112+
error "Failed to request pre-shutdown"
113+
exit "${EXIT_SYNCD_SHUTDOWN}"
114+
}
114115
}
115116
116117
function wait_for_pre_shutdown_complete_or_fail()
@@ -145,12 +146,12 @@ function wait_for_pre_shutdown_complete_or_fail()
145146
146147
if [[ x"${STATE}" != x"pre-shutdown-succeeded" ]]; then
147148
debug "Syncd pre-shutdown failed: ${STATE} ..."
148-
exit 10
149+
exit "${EXIT_SYNCD_SHUTDOWN}"
149150
fi
150151
debug "Pre-shutdown succeeded ..."
151152
}
152153
153-
function backup_datebase()
154+
function backup_database()
154155
{
155156
debug "Backing up database ..."
156157
# Dump redis content to a file 'dump.rdb' in warmboot directory
@@ -162,8 +163,8 @@ function backup_datebase()
162163
redis.call('del', k)
163164
end
164165
end
165-
" 0
166-
redis-cli save
166+
" 0 > /dev/null
167+
redis-cli save > /dev/null
167168
docker cp database:/var/lib/redis/$REDIS_FILE $WARM_DIR
168169
docker exec -i database rm /var/lib/redis/$REDIS_FILE
169170
}
@@ -181,27 +182,17 @@ case "$REBOOT_TYPE" in
181182
REBOOT_TYPE="fastfast-reboot"
182183
BOOT_TYPE_ARG="fastfast"
183184
# source mlnx-ffb.sh file with
184-
# functions to check ISSU upgrade/do ISSU start
185+
# functions to check ISSU upgrade possibility
185186
source mlnx-ffb.sh
186-
187-
trap clear_warm_boot EXIT HUP INT QUIT TERM KILL ABRT ALRM
188-
189-
# Set warm reboot flag for some components.
190-
# In fastfast boot flow, only APPL layer dockers
191-
# are enabled to perform warm restart
192-
config warm_restart disable system
193-
config warm_restart disable swss
194-
config warm_restart enable bgp
195-
config warm_restart enable teamd
196187
else
197188
BOOT_TYPE_ARG="warm"
198-
trap clear_warm_boot EXIT HUP INT QUIT TERM KILL ABRT ALRM
199-
config warm_restart enable system
200189
fi
190+
trap clear_warm_boot EXIT HUP INT QUIT TERM KILL ABRT ALRM
191+
config warm_restart enable system
201192
;;
202193
*)
203-
echo "Not supported reboot type: $REBOOT_TYPE" >&2
204-
exit 1
194+
error "Not supported reboot type: $REBOOT_TYPE"
195+
exit "${EXIT_NOT_SUPPORTED}"
205196
;;
206197
esac
207198
@@ -222,75 +213,63 @@ elif grep -q onie_platform= /host/machine.conf; then
222213
KERNEL_IMAGE="/host$(echo $KERNEL_OPTIONS | cut -d ' ' -f 2)"
223214
BOOT_OPTIONS="$(echo $KERNEL_OPTIONS | sed -e 's/\s*linux\s*/BOOT_IMAGE=/') SONIC_BOOT_TYPE=${BOOT_TYPE_ARG}"
224215
else
225-
echo "Unknown bootloader. ${REBOOT_TYPE} is not supported."
226-
exit 1
216+
error "Unknown bootloader. ${REBOOT_TYPE} is not supported."
217+
exit "${EXIT_NOT_SUPPORTED}"
227218
fi
228219
INITRD=$(echo $KERNEL_IMAGE | sed 's/vmlinuz/initrd.img/g')
229220
230221
# Install new FW for mellanox platforms before control plane goes down
231222
# So on boot switch will not spend time to upgrade FW increasing the CP downtime
232223
if [[ "$sonic_asic_type" == "mellanox" ]]; then
224+
MLNX_EXIT_SUCCESS=0
225+
MLNX_EXIT_FW_ERROR=100
226+
MLNX_EXIT_FFB_FAILURE=101
233227
234-
if [[ "$REBOOT_TYPE" = "fastfast-reboot" ]]; then
235-
check_issu_enabled || {
236-
echo "Warm reboot is not supported by this HWSKU"
237-
exit 1
238-
}
228+
MLNX_FW_UPGRADE_SCRIPT="/usr/bin/mlnx-fw-upgrade.sh"
239229
240-
check_sdk_upgrade || {
241-
echo "Warm reboot is not supported"
242-
exit 1
230+
231+
if [[ "$REBOOT_TYPE" = "fastfast-reboot" ]]; then
232+
check_ffb || {
233+
error "Warm reboot is not supported"
234+
exit "${MLNX_EXIT_FFB_FAILURE}"
243235
}
244236
fi
245237
246-
echo "Prepare MLNX ASIC to ${REBOOT_TYPE}: install new FW if required"
247-
248-
MLNX_EXIT_SUCCESS="0"
249-
MLNX_EXIT_ERROR="1"
250-
251-
MLNX_FW_UPGRADE_SCRIPT="/usr/bin/mlnx-fw-upgrade.sh"
238+
debug "Prepare MLNX ASIC to ${REBOOT_TYPE}: install new FW if required"
252239
253240
${MLNX_FW_UPGRADE_SCRIPT} --upgrade
254241
MLNX_EXIT_CODE="$?"
255242
if [[ "${MLNX_EXIT_CODE}" != "${MLNX_EXIT_SUCCESS}" ]]; then
256-
echo "Failed to burn MLNX FW: errno=${MLNX_EXIT_CODE}"
257-
exit "${MLNX_EXIT_ERROR}"
258-
fi
259-
260-
if [[ "$REBOOT_TYPE" = "fastfast-reboot" ]]; then
261-
issu_start || {
262-
echo "ISSU start failed"
263-
echo "Cold reboot may be requiered to recover"
264-
exit 1
265-
}
243+
error "Failed to burn MLNX FW: errno=${MLNX_EXIT_CODE}"
244+
exit "${MLNX_EXIT_FW_ERROR}"
266245
fi
267246
fi
268247
269248
# Load kernel into the memory
270249
/sbin/kexec -l "$KERNEL_IMAGE" --initrd="$INITRD" --append="$BOOT_OPTIONS"
271250
272-
if [[ "$REBOOT_TYPE" = "fast-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; then
251+
if [[ "$REBOOT_TYPE" = "fast-reboot" ]]; then
273252
# Dump the ARP and FDB tables to files also as default routes for both IPv4 and IPv6
274253
# into /host/fast-reboot
275254
mkdir -p /host/fast-reboot
276255
/usr/bin/fast-reboot-dump.py -t /host/fast-reboot
277256
fi
278257
279-
if [[ "$REBOOT_TYPE" = "warm-reboot" ]]; then
258+
if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; then
280259
# Freeze orchagent for warm restart
281260
# Try freeze 5 times, it is possible that the orchagent is in transient state and no opportunity to be freezed
282261
# Note: assume that 1 second is enough for orchagent to process the request and respone freeze or not
283262
debug "Pausing orchagent ..."
284263
for i in `seq 4 -1 0`; do
285-
docker exec -i swss /usr/bin/orchagent_restart_check -w 1000 && break
286-
echo "RESTARTCHECK failed $i" >&2
264+
docker exec -i swss /usr/bin/orchagent_restart_check -w 1000 > /dev/null && break
265+
error "RESTARTCHECK failed $i"
287266
if [[ "$i" = "0" ]]; then
288-
echo "RESTARTCHECK failed finally" >&2
267+
error "RESTARTCHECK failed finally"
289268
if [[ x"${FORCE}" == x"yes" ]]; then
290269
debug "Ignoring orchagent pausing failure ..."
291270
break;
292271
fi
293-
exit 10
272+
exit "${EXIT_ORCHAGENT_SHUTDOWN}"
294273
fi
295274
sleep 1
296275
done
@@ -313,38 +292,26 @@ if [[ "$REBOOT_TYPE" = "fast-reboot" ]]; then
313292
fi
314293
315294
# Kill swss dockers
316-
docker kill swss
317-
318-
319-
# Warm reboot: dump state to host disk
320-
if [[ "$REBOOT_TYPE" = "fastfast-reboot" ]]; then
321-
mkdir -p $WARM_DIR
322-
323-
# Dump route table form APPL DB.
324-
# This route table will be used by fpmsyncd
325-
# reconcialtion logic
326-
cleanup_except_table 0 'ROUTE_TABLE'
327-
cleanup_except_table 4 'WARM_RESTART_TABLE'
328-
cleanup_except_table 6 'WARM_RESTART_TABLE'
329-
330-
redis-cli -n 1 FLUSHDB
331-
redis-cli -n 2 FLUSHDB
332-
redis-cli -n 5 FLUSHDB
333-
334-
redis-cli save
335-
docker cp database:/var/lib/redis/$REDIS_FILE $WARM_DIR
336-
docker exec -i database rm /var/lib/redis/$REDIS_FILE
337-
fi
295+
docker kill swss > /dev/null
338296
339297
# Pre-shutdown syncd
340-
if [[ "$REBOOT_TYPE" = "warm-reboot" ]]; then
298+
if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; then
341299
initialize_pre_shutdown
342300
343301
request_pre_shutdown
344302
345303
wait_for_pre_shutdown_complete_or_fail
346304
347-
backup_datebase
305+
# Warm reboot: dump state to host disk
306+
if [[ "$REBOOT_TYPE" = "fastfast-reboot" ]]; then
307+
redis-cli -n 1 FLUSHDB > /dev/null
308+
redis-cli -n 2 FLUSHDB > /dev/null
309+
redis-cli -n 5 FLUSHDB > /dev/null
310+
fi
311+
312+
# TODO: backup_database preserves FDB_TABLE
313+
# need to cleanup as well for fastfast boot case
314+
backup_database
348315
fi
349316
350317
# Stop teamd gracefully
@@ -353,18 +320,12 @@ if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; t
353320
# Send USR1 signal to all teamd instances to stop them
354321
# It will prepare teamd for warm-reboot
355322
# Note: We must send USR1 signal before syncd, because it will send the last packet through CPU port
356-
docker exec -i teamd pkill -USR1 teamd > /dev/null
323+
docker exec -i teamd pkill -USR1 teamd || [ $? == 1 ] > /dev/null
357324
debug "Stopped teamd ..."
358325
fi
359326
360327
debug "Stopping syncd ..."
361-
# syncd service stop is capable of handling both warm/fast/cold shutdown
362-
if [[ "$sonic_asic_type" = "mellanox" ]]; then
363-
docker kill syncd
364-
else
365-
# syncd service stop is capable of handling both warm/fast/cold shutdown
366-
systemctl stop syncd
367-
fi
328+
systemctl stop syncd
368329
debug "Stopped syncd ..."
369330
370331
# Kill other containers to make the reboot faster
@@ -403,5 +364,5 @@ debug "Rebooting with ${REBOOT_METHOD} to ${NEXT_SONIC_IMAGE} ..."
403364
exec ${REBOOT_METHOD}
404365
405366
# Should never reach here
406-
echo "${REBOOT_TYPE} failed!" >&2
407-
exit 1
367+
error "${REBOOT_TYPE} failed!"
368+
exit "${EXIT_FAILURE}"

show/mlnx.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ def is_issu_status_enabled():
8181
# Get the SAI XML path from sai.profile
8282
sai_profile_path = '/{}/sai.profile'.format(HWSKU_PATH)
8383

84-
DOCKER_CAT_COMMAND = 'docker exec -ti {container_name} cat {path}'
84+
DOCKER_CAT_COMMAND = 'docker exec {container_name} cat {path}'
8585

8686
command = DOCKER_CAT_COMMAND.format(container_name=CONTAINER_NAME, path=sai_profile_path)
8787
sai_profile_content, _ = run_command(command, print_to_console=False)

0 commit comments

Comments
 (0)