Skip to content

Commit 29771e7

Browse files
[techsupport] improve robustness (sonic-net#2117)
- What I did Execute the command in a separate bash process under timeout. Due to with '--foreground' the child processes are not killed and still have the file descriptor opened to piped process which will basically hang forever. From the "timeout" man page: --foreground when not running timeout directly from a shell prompt, allow COMMAND to read from the TTY and get TTY signals; in this mode, children of COMMAND will not be timed out So, a command like this: timeout --foreground 5s some_process_that_spawn_childs_that_produce_output | tee some_file Will actually hang forever. The '--foreground' option is still needed. So the proposed solution is to run that construct in another bash process: timeout --foreground 5s bash -c "some_process_that_spawn_childs_that_produce_output | tee some_file" Moved hw-mgmt-dump to collect_mellanox function and execute it under timeout. Handle global timeout: When global timeout is reached the SIGTERM is sent to generate_dump process, upon SIGTERM the generate_dump process will finalize its work by creating a dump of what already has been collected, providing a dump even when timeout is reached. The generate_dump will be killed in case it cannot create dump after initial SIGTERM. - How I did it I implemented the above mentioned features. - How to verify it Artificially hang the FW, execute "show techsupport" and verify even in case timeout is reached the dump is generated. Signed-off-by: Stepan Blyschak <[email protected]>
1 parent f9dc681 commit 29771e7

File tree

3 files changed

+118
-57
lines changed

3 files changed

+118
-57
lines changed

scripts/generate_dump

+85-45
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,12 @@ EXT_RECVSIG=3
1313
EXT_RETRY=4
1414
EXT_TAR_FAILED=5
1515
EXT_PROCFS_SAVE_FAILED=6
16+
EXT_INTERRUPTED=7
17+
EXT_TERMINATED=8
1618
EXT_INVALID_ARGUMENT=10
1719

20+
TIMEOUT_EXIT_CODE=124
21+
1822
TAR=tar
1923
MKDIR=mkdir
2024
RM=rm
@@ -61,6 +65,8 @@ rm_lock_and_exit()
6165
handle_exit()
6266
{
6367
ECODE=$?
68+
echo "Cleaning up working directory $TARDIR"
69+
$RM -rf $TARDIR
6470
echo "Removing lock. Exit: $ECODE" >&2
6571
$RM $V -rf ${LOCKDIR}
6672
# Echo the filename as the last statement if the generation succeeds
@@ -69,11 +75,16 @@ handle_exit()
6975
fi
7076
}
7177

72-
handle_signal()
78+
handle_sigint()
7379
{
7480
echo "Generate Dump received interrupt" >&2
75-
$RM $V -rf $TARDIR
76-
exit $EXT_RECVSIG
81+
exit $EXT_INTERRUPTED
82+
}
83+
84+
handle_sigterm() {
85+
echo "Dump generation terminated" >&2
86+
finalize
87+
exit $EXT_TERMINATED
7788
}
7889

7990
handle_error() {
@@ -83,6 +94,10 @@ handle_error() {
8394
fi
8495
}
8596

97+
escape_quotes() {
98+
echo $1 | sed 's/\"/\\\"/g'
99+
}
100+
86101
save_bcmcmd() {
87102
trap 'handle_error $? $LINENO' ERR
88103
local start_t=$(date +%s%3N)
@@ -93,6 +108,7 @@ save_bcmcmd() {
93108
local do_gzip=${3:-false}
94109
local tarpath="${BASE}/dump/$filename"
95110
local timeout_cmd="timeout --foreground ${TIMEOUT_MIN}m"
111+
local cmd=$(escape_quotes "$cmd")
96112
if [ ! -d $LOGDIR ]; then
97113
$MKDIR $V -p $LOGDIR
98114
fi
@@ -106,12 +122,12 @@ save_bcmcmd() {
106122
# as one argument, e.g. vtysh -c "COMMAND HERE" needs to have
107123
# "COMMAND HERE" bunched together as 1 arg to vtysh -c
108124
if $NOOP; then
109-
echo "${timeout_cmd} $cmd &> '${filepath}'"
125+
echo "${timeout_cmd} bash -c \"${cmd}\" &> '${filepath}'"
110126
else
111127
ret=0
112-
eval "${timeout_cmd} $cmd" &> "${filepath}" || ret=$?
128+
eval "${timeout_cmd} bash -c \"${cmd}\" &> '${filepath}'" || ret=$?
113129
if [ $ret -ne 0 ]; then
114-
if [ $ret -eq 124 ]; then
130+
if [ $ret -eq $TIMEOUT_EXIT_CODE ]; then
115131
echo "Command: $cmd timedout after ${TIMEOUT_MIN} minutes."
116132
else
117133
RC=0
@@ -207,6 +223,8 @@ save_cmd() {
207223
redirect_eval=""
208224
fi
209225

226+
local cmd=$(escape_quotes "$cmd")
227+
local cleanup_method_declration=$(declare -f $cleanup_method)
210228
# eval required here to re-evaluate the $cmd properly at runtime
211229
# This is required if $cmd has quoted strings that should be bunched
212230
# as one argument, e.g. vtysh -c "COMMAND HERE" needs to have
@@ -215,25 +233,29 @@ save_cmd() {
215233
tarpath="${tarpath}.gz"
216234
filepath="${filepath}.gz"
217235
# cleanup_method will run in a sub-shell, need declare it first
218-
local cleanup_method_declration=$(declare -f $cleanup_method)
219236
local cmds="$cleanup_method_declration; $cmd $redirect_eval | $cleanup_method | gzip -c > '${filepath}'"
220237
if $NOOP; then
221238
echo "${timeout_cmd} bash -c \"${cmds}\""
222239
else
223240
RC=0
224241
eval "${timeout_cmd} bash -c \"${cmds}\"" || RC=$?
225-
if [ $RC -ne 0 ]; then
242+
if [ $RC -eq $TIMEOUT_EXIT_CODE ]; then
226243
echo "Command: $cmds timedout after ${TIMEOUT_MIN} minutes."
244+
elif [ $RC -ne 0 ]; then
245+
echo "Command: $cmds failed with RC $RC"
227246
fi
228247
fi
229248
else
249+
local cmds="$cleanup_method_declration; $cmd | $cleanup_method $redirect '$filepath'"
230250
if $NOOP; then
231-
echo "${timeout_cmd} $cmd | $cleanup_method $redirect '$filepath'"
251+
echo "${timeout_cmd} bash -c \"${cmds}\""
232252
else
233253
RC=0
234-
eval "${timeout_cmd} $cmd | $cleanup_method" "$redirect" "$filepath" || RC=$?
235-
if [ $RC -ne 0 ]; then
236-
echo "Command: $cmd timedout after ${TIMEOUT_MIN} minutes."
254+
eval "${timeout_cmd} bash -c \"${cmds}\"" || RC=$?
255+
if [ $RC -eq $TIMEOUT_EXIT_CODE ]; then
256+
echo "Command: $cmds timedout after ${TIMEOUT_MIN} minutes."
257+
elif [ $RC -ne 0 ]; then
258+
echo "Command: $cmds failed with RC $RC"
237259
fi
238260
fi
239261
fi
@@ -484,20 +506,20 @@ save_bgp_neighbor() {
484506
local asic_id=${1:-""}
485507
local ns=$(get_vtysh_namespace $asic_id)
486508

487-
neighbor_list_v4=$(${timeout_cmd} vtysh $ns -c "show ip bgp neighbors" | grep "BGP neighbor is" | awk -F '[, ]' '{print $4}')
509+
neighbor_list_v4=$(${timeout_cmd} bash -c "vtysh $ns -c 'show ip bgp neighbors' | grep 'BGP neighbor is' | awk -F '[, ]' '{print \$4}'")
488510
for word in $neighbor_list_v4; do
489511
save_cmd "vtysh $ns -c \"show ip bgp neighbors $word advertised-routes\"" "ip.bgp.neighbor.$word.adv$asic_id"
490512
save_cmd "vtysh $ns -c \"show ip bgp neighbors $word routes\"" "ip.bgp.neighbor.$word.rcv$asic_id"
491513
done
492-
neighbor_list_v6=$(vtysh $ns -c "show bgp ipv6 neighbors" | grep "BGP neighbor is" | awk -F '[, ]' '{print $4}' | fgrep ':')
514+
neighbor_list_v6=$(${timeout_cmd} bash -c "vtysh $ns -c 'show bgp ipv6 neighbors' | grep 'BGP neighbor is' | awk -F '[, ]' '{print \$4}' | fgrep ':'")
493515
for word in $neighbor_list_v6; do
494516
save_cmd "vtysh $ns -c \"show bgp ipv6 neighbors $word advertised-routes\"" "ipv6.bgp.neighbor.$word.adv$asic_id"
495517
save_cmd "vtysh $ns -c \"show bgp ipv6 neighbors $word routes\"" "ipv6.bgp.neighbor.$word.rcv$asic_id"
496518
done
497519

498-
vrf_list=`${timeout_cmd} vtysh $ns -c "show vrf" | awk -F" " '{print $2}'`
520+
vrf_list=`${timeout_cmd} bash -c "vtysh $ns -c 'show vrf' | awk -F" " '{print \$2}'"`
499521
for vrf in $vrf_list; do
500-
neighbor_list=`${timeout_cmd} vtysh $ns -c "show ip bgp vrf $vrf neighbors" | grep "BGP neighbor is" | awk -F '[, ]' '{print $4}'`
522+
neighbor_list=`${timeout_cmd} bash -c "vtysh $ns -c 'show ip bgp vrf $vrf neighbors' | grep 'BGP neighbor is' | awk -F '[, ]' '{print \$4}'"`
501523
for word in $neighbor_list; do
502524
save_cmd "vtysh $ns -c \"show ip bgp vrf $vrf neighbors $word advertised-routes\"" "ip.bgp.neighbor.$vrf.$word.adv$asic_id"
503525
save_cmd "vtysh $ns -c \"show ip bgp vrf $vrf neighbors $word routes\"" "ip.bgp.neighbor.$vrf.$word.rcv$asic_id"
@@ -737,7 +759,7 @@ save_platform_info() {
737759
save_cmd "show platform psustatus" "psustatus"
738760
save_cmd "show platform ssdhealth" "ssdhealth"
739761
save_cmd "show platform temperature" "temperature"
740-
save_cmd "show platform fan" "fan"
762+
save_cmd "show platform fan" "fan"
741763
fi
742764
}
743765

@@ -856,6 +878,7 @@ enable_logrotate() {
856878
###############################################################################
857879
collect_mellanox() {
858880
trap 'handle_error $? $LINENO' ERR
881+
local timeout_cmd="timeout --foreground ${TIMEOUT_MIN}m"
859882
local sai_dump_folder="/tmp/saisdkdump"
860883
local sai_dump_filename="${sai_dump_folder}/sai_sdk_dump_$(date +"%m_%d_%Y_%I_%M_%p")"
861884

@@ -865,12 +888,12 @@ collect_mellanox() {
865888
copy_from_docker syncd $sai_dump_folder $sai_dump_folder
866889
echo "$sai_dump_folder"
867890
for file in `ls $sai_dump_folder`; do
868-
save_file ${sai_dump_folder}/${file} sai_sdk_dump true
891+
save_file ${sai_dump_folder}/${file} sai_sdk_dump true
869892
done
870893

871894
${CMD_PREFIX}rm -rf $sai_dump_folder
872895
${CMD_PREFIX}docker exec syncd rm -rf $sai_dump_folder
873-
896+
874897
# Save SDK error dumps
875898
local sdk_dump_path=`${CMD_PREFIX}docker exec syncd cat /tmp/sai.profile|grep "SAI_DUMP_STORE_PATH"|cut -d = -f2`
876899
if [[ -d $sdk_dump_path ]]; then
@@ -880,6 +903,26 @@ collect_mellanox() {
880903
done
881904
rm -rf /tmp/sdk-dumps
882905
fi
906+
907+
# run 'hw-management-generate-dump.sh' script and save the result file
908+
HW_DUMP_FILE=/usr/bin/hw-management-generate-dump.sh
909+
if [ -f "$HW_DUMP_FILE" ]; then
910+
${CMD_PREFIX}${timeout_cmd} /usr/bin/hw-management-generate-dump.sh $ALLOW_PROCESS_STOP
911+
ret=$?
912+
if [ $ret -ne 0 ]; then
913+
if [ $ret -eq $TIMEOUT_EXIT_CODE ]; then
914+
echo "hw-management dump timedout after ${TIMEOUT_MIN} minutes."
915+
else
916+
echo "hw-management dump failed ..."
917+
fi
918+
else
919+
save_file "/tmp/hw-mgmt-dump*" "hw-mgmt" false
920+
rm -f /tmp/hw-mgmt-dump*
921+
fi
922+
else
923+
echo "HW Mgmt dump script $HW_DUMP_FILE does not exist"
924+
fi
925+
883926
}
884927

885928
###############################################################################
@@ -1087,12 +1130,11 @@ save_crash_files() {
10871130
get_asic_count() {
10881131
trap 'handle_error $? $LINENO' ERR
10891132
local redirect_eval="2>&1"
1090-
if ! $SAVE_STDERR
1133+
if ! $SAVE_STDERR
10911134
then
10921135
redirect_eval=""
10931136
fi
1094-
local cmd="show platform summary --json | python -c 'import sys, json; \
1095-
print(json.load(sys.stdin)[\"asic_count\"])'"
1137+
local cmd="python -c 'from sonic_py_common.multi_asic import get_num_asics; print(get_num_asics())'"
10961138
echo `eval ${cmd} ${redirect_eval}`
10971139
}
10981140

@@ -1199,6 +1241,11 @@ main() {
11991241
end_t=$(date +%s%3N)
12001242
echo "[ Capture Proc State ] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO
12011243

1244+
# Save logs and cores early
1245+
save_log_files
1246+
save_crash_files
1247+
save_warmboot_files
1248+
12021249
# Save all the processes within each docker
12031250
save_cmd "show services" services.summary
12041251

@@ -1265,14 +1312,14 @@ main() {
12651312
save_bfd_info
12661313
save_redis_info
12671314

1268-
if $DEBUG_DUMP
1315+
if $DEBUG_DUMP
12691316
then
12701317
save_dump_state_all_ns
12711318
fi
12721319

12731320
save_cmd "docker ps -a" "docker.ps"
12741321
save_cmd "docker top pmon" "docker.pmon"
1275-
1322+
12761323
if [[ -d ${PLUGINS_DIR} ]]; then
12771324
local -r dump_plugins="$(find ${PLUGINS_DIR} -type f -executable)"
12781325
for plugin in $dump_plugins; do
@@ -1333,25 +1380,16 @@ main() {
13331380
end_t=$(date +%s%3N)
13341381
echo "[ TAR /etc Files ] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO
13351382

1336-
save_log_files
1337-
save_warmboot_files
1338-
save_crash_files
1383+
finalize
1384+
}
13391385

1340-
# run 'hw-management-generate-dump.sh' script and save the result file
1341-
HW_DUMP_FILE=/usr/bin/hw-management-generate-dump.sh
1342-
if [ -f "$HW_DUMP_FILE" ]; then
1343-
/usr/bin/hw-management-generate-dump.sh $ALLOW_PROCESS_STOP
1344-
save_file "/tmp/hw-mgmt-dump*" "hw-mgmt" false
1345-
rm -f /tmp/hw-mgmt-dump*
1346-
else
1347-
echo "HW Mgmt dump script $HW_DUMP_FILE does not exist"
1348-
fi
1386+
###############################################################################
1387+
# Finalize dump generation
1388+
###############################################################################
1389+
finalize() {
13491390
# Save techsupport timing profile info
13501391
save_file $TECHSUPPORT_TIME_INFO log false
13511392

1352-
# clean up working tar dir before compressing
1353-
$RM $V -rf $TARDIR
1354-
13551393
if $DO_COMPRESS; then
13561394
RC=0
13571395
$GZIP $V $TARFILE || RC=$?
@@ -1364,13 +1402,14 @@ main() {
13641402

13651403
# Invoke the TechSupport Cleanup Hook
13661404
setsid python3 /usr/local/bin/techsupport_cleanup.py ${TARFILE} &> /tmp/techsupport_cleanup.log &
1367-
1405+
13681406
if ! $SAVE_STDERR
13691407
then
13701408
exit $RETURN_CODE
13711409
fi
13721410
}
13731411

1412+
13741413
###############################################################################
13751414
# Remove secret from pipeline inout and output result to pipeline.
13761415
# Globals:
@@ -1416,7 +1455,7 @@ remove_secret_from_etc_files() {
14161455

14171456
# Remove snmp community string from snmp.yml
14181457
sed -i -E 's/(\s*snmp_\S*community\s*:\s*)(\S*)/\1****/g' $dumppath/etc/sonic/snmp.yml
1419-
1458+
14201459
# Remove secret from /etc/sonic/config_db.json
14211460
cat $dumppath/etc/sonic/config_db.json | remove_secret_from_config_db_dump > $dumppath/etc/sonic/config_db.json.temp
14221461
mv $dumppath/etc/sonic/config_db.json.temp $dumppath/etc/sonic/config_db.json
@@ -1475,9 +1514,9 @@ OPTIONS
14751514
"24 March", "yesterday", etc.
14761515
-t TIMEOUT_MINS
14771516
Command level timeout in minutes
1478-
-r
1517+
-r
14791518
Redirect any intermediate errors to STDERR
1480-
-d
1519+
-d
14811520
Collect the output of debug dump cli
14821521
EOF
14831522
}
@@ -1527,7 +1566,7 @@ while getopts ":xnvhzas:t:r:d" opt; do
15271566
r)
15281567
SAVE_STDERR=false
15291568
;;
1530-
d)
1569+
d)
15311570
DEBUG_DUMP=true
15321571
;;
15331572
/?)
@@ -1553,7 +1592,8 @@ if $MKDIR "${LOCKDIR}" &>/dev/null; then
15531592
echo "$$" > "${PIDFILE}"
15541593
# This handler will exit the script upon receiving these interrupts
15551594
# Trap configured on EXIT will be triggered by the exit from handle_signal function
1556-
trap 'handle_signal' SIGINT SIGHUP SIGQUIT SIGTERM
1595+
trap 'handle_sigterm' SIGHUP SIGQUIT SIGTERM
1596+
trap 'handle_sigint' SIGINT
15571597
echo "Lock succesfully accquired and installed signal handlers"
15581598
# Proceed with the actual code
15591599
if [[ ! -z "${V}" ]]; then

show/main.py

+9-12
Original file line numberDiff line numberDiff line change
@@ -68,25 +68,22 @@
6868

6969
GEARBOX_TABLE_PHY_PATTERN = r"_GEARBOX_TABLE:phy:*"
7070

71+
COMMAND_TIMEOUT = 300
72+
7173
# To be enhanced. Routing-stack information should be collected from a global
7274
# location (configdb?), so that we prevent the continous execution of this
7375
# bash oneliner. To be revisited once routing-stack info is tracked somewhere.
7476
def get_routing_stack():
77+
result = None
7578
command = "sudo docker ps | grep bgp | awk '{print$2}' | cut -d'-' -f3 | cut -d':' -f1 | head -n 1"
7679

7780
try:
78-
proc = subprocess.Popen(command,
79-
stdout=subprocess.PIPE,
80-
shell=True,
81-
text=True)
82-
stdout = proc.communicate()[0]
83-
proc.wait()
81+
stdout = subprocess.check_output(command, shell=True, timeout=COMMAND_TIMEOUT)
8482
result = stdout.rstrip('\n')
83+
except Exception as err:
84+
click.echo('Failed to get routing stack: {}'.format(err), err=True)
8585

86-
except OSError as e:
87-
raise OSError("Cannot detect routing-stack")
88-
89-
return (result)
86+
return result
9087

9188

9289
# Global Routing-Stack variable
@@ -1149,7 +1146,7 @@ def users(verbose):
11491146
@click.option('--redirect-stderr', '-r', is_flag=True, help="Redirect an intermediate errors to STDERR")
11501147
def techsupport(since, global_timeout, cmd_timeout, verbose, allow_process_stop, silent, debug_dump, redirect_stderr):
11511148
"""Gather information for troubleshooting"""
1152-
cmd = "sudo timeout -s SIGTERM --foreground {}m".format(global_timeout)
1149+
cmd = "sudo timeout --kill-after={}s -s SIGTERM --foreground {}m".format(COMMAND_TIMEOUT, global_timeout)
11531150

11541151
if allow_process_stop:
11551152
cmd += " -a"
@@ -1164,7 +1161,7 @@ def techsupport(since, global_timeout, cmd_timeout, verbose, allow_process_stop,
11641161
cmd += " -s '{}'".format(since)
11651162

11661163
if debug_dump:
1167-
cmd += " -d "
1164+
cmd += " -d"
11681165

11691166
cmd += " -t {}".format(cmd_timeout)
11701167
if redirect_stderr:

0 commit comments

Comments
 (0)