Skip to content

Commit 6de91af

Browse files
authored
[Auto-Techsupport] Issues related to Multiple Cores crashing handled (#1948)
#### What I did **Issues seen when multiple cores crash in very quick succession:** 1) The **rate_limit_interval** is not honored. Previously, the most recently created techsupport dump was found using the glob pattern `sonic_dump_*.tar*`, which does not match dumps that are still being generated, because in-progress dumps do not yet have the .tar.gz extension. To fix this, `get_ts_dumps` now searches based on TS_ROOT, i.e. `sonic_dump_*`. 2) **show auto-tech support history** does not show all the created dumps. Previously, the handler took the diff of the techsupport dumps present before and after the invocation and assigned that diff as the techsupport dump corresponding to this core. This approach is prone to a race condition, since the diff can contain multiple dumps created during that interval. This is avoided by parsing the dump name from the stdout returned by the `show techsupport` invocation. #### How to verify it 1) Run the unit tests. 2) Generate core dumps in very quick succession using the default rate limit interval. Only one entry should appear in the techsupport history. 3) Set the global rate limit interval to 0 and generate cores in quick succession. A few entries should appear in the history.
1 parent 656ade1 commit 6de91af

File tree

5 files changed

+67
-42
lines changed

5 files changed

+67
-42
lines changed

scripts/coredump_gen_handler.py

+20-13
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import time
99
import argparse
1010
import syslog
11+
import re
1112
from swsscommon.swsscommon import SonicV2Connector
1213
from utilities_common.auto_techsupport_helper import *
1314

@@ -54,7 +55,6 @@ def __init__(self, core_name, container_name, db):
5455
self.db = db
5556
self.proc_mp = {}
5657
self.core_ts_map = {}
57-
self.curr_ts_list = []
5858

5959
def handle_core_dump_creation_event(self):
6060
file_path = os.path.join(CORE_DUMP_DIR, self.core_name)
@@ -93,7 +93,7 @@ def handle_core_dump_creation_event(self):
9393
since_cfg = self.get_since_arg()
9494
new_file = self.invoke_ts_cmd(since_cfg)
9595
if new_file:
96-
self.write_to_state_db(int(time.time()), new_file[0])
96+
self.write_to_state_db(int(time.time()), new_file)
9797

9898
def write_to_state_db(self, timestamp, ts_dump):
9999
name = strip_ts_ext(ts_dump)
@@ -111,26 +111,33 @@ def get_since_arg(self):
111111
return since_cfg
112112
return SINCE_DEFAULT
113113

114+
def parse_ts_dump_name(self, ts_stdout):
115+
""" Figure out the ts_dump name from the techsupport stdout """
116+
matches = re.findall(TS_PTRN, ts_stdout)
117+
if matches:
118+
return matches[-1]
119+
syslog.syslog(syslog.LOG_ERR, "stdout of the 'show techsupport' cmd doesn't have the dump name")
120+
return ""
121+
114122
def invoke_ts_cmd(self, since_cfg):
115123
since_cfg = "'" + since_cfg + "'"
116-
cmd = " ".join(["show", "techsupport", "--since", since_cfg])
117-
rc, _, stderr = subprocess_exec(["show", "techsupport", "--since", since_cfg], env=ENV_VAR)
124+
cmd_opts = ["show", "techsupport", "--silent", "--since", since_cfg]
125+
cmd = " ".join(cmd_opts)
126+
rc, stdout, stderr = subprocess_exec(cmd_opts, env=ENV_VAR)
118127
if not rc:
119128
syslog.syslog(syslog.LOG_ERR, "show techsupport failed with exit code {}, stderr:{}".format(rc, stderr))
120-
new_list = get_ts_dumps(True)
121-
diff = list(set(new_list).difference(set(self.curr_ts_list)))
122-
self.curr_ts_list = new_list
123-
if not diff:
129+
new_dump = self.parse_ts_dump_name(stdout)
130+
if not new_dump:
124131
syslog.syslog(syslog.LOG_ERR, "{} was run, but no techsupport dump is found".format(cmd))
125132
else:
126-
syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, diff))
127-
return diff
133+
syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, new_dump))
134+
return new_dump
128135

129136
def verify_rate_limit_intervals(self, global_cooloff, container_cooloff):
130137
"""Verify both the global and per-proc rate_limit_intervals have passed"""
131-
self.curr_ts_list = get_ts_dumps(True)
132-
if global_cooloff and self.curr_ts_list:
133-
last_ts_dump_creation = os.path.getmtime(self.curr_ts_list[-1])
138+
curr_ts_list = get_ts_dumps(True)
139+
if global_cooloff and curr_ts_list:
140+
last_ts_dump_creation = os.path.getmtime(curr_ts_list[-1])
134141
if time.time() - last_ts_dump_creation < global_cooloff:
135142
msg = "Global rate_limit_interval period has not passed. Techsupport Invocation is skipped. Core: {}"
136143
syslog.syslog(syslog.LOG_INFO, msg.format(self.core_name))

scripts/generate_dump

+1-2
Original file line numberDiff line numberDiff line change
@@ -1339,8 +1339,7 @@ main() {
13391339
fi
13401340

13411341
# Invoke the TechSupport Cleanup Hook
1342-
setsid $(echo > /tmp/techsupport_cleanup.log;
1343-
python3 /usr/local/bin/techsupport_cleanup.py ${TARFILE} &>> /tmp/techsupport_cleanup.log) &
1342+
setsid python3 /usr/local/bin/techsupport_cleanup.py ${TARFILE} &> /tmp/techsupport_cleanup.log &
13441343

13451344
echo ${TARFILE}
13461345

scripts/techsupport_cleanup.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def handle_techsupport_creation_event(dump_name, db):
2222
file_path = os.path.join(TS_DIR, dump_name)
2323
if not verify_recent_file_creation(file_path):
2424
return
25-
_ , num_bytes = get_stats(os.path.join(TS_DIR, TS_PTRN))
25+
_ , num_bytes = get_stats(os.path.join(TS_DIR, TS_PTRN_GLOB))
2626

2727
if db.get(CFG_DB, AUTO_TS, CFG_STATE) != "enabled":
2828
msg = "techsupport_cleanup is disabled. No cleanup is performed. current size occupied : {}"
@@ -40,7 +40,7 @@ def handle_techsupport_creation_event(dump_name, db):
4040
syslog.syslog(syslog.LOG_NOTICE, msg.format(pretty_size(num_bytes)))
4141
return
4242

43-
removed_files = cleanup_process(max_ts, TS_PTRN, TS_DIR)
43+
removed_files = cleanup_process(max_ts, TS_PTRN_GLOB, TS_DIR)
4444
clean_state_db_entries(removed_files, db)
4545

4646

tests/coredump_gen_handler_test.py

+36-21
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,11 @@
1212
sys.path.append("scripts")
1313
import coredump_gen_handler as cdump_mod
1414

15+
AUTO_TS_STDOUT="""
16+
Techsupport is running with silent option. This command might take a long time.
17+
The SAI dump is generated to /tmp/saisdkdump/sai_sdk_dump_11_22_2021_11_07_PM
18+
/tmp/saisdkdump
19+
"""
1520

1621
def set_auto_ts_cfg(redis_mock, state="disabled",
1722
rate_limit_interval="0",
@@ -74,12 +79,13 @@ def test_invoc_ts_state_db_update(self):
7479
populate_state_db(redis_mock)
7580
with Patcher() as patcher:
7681
def mock_cmd(cmd, env):
82+
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
7783
cmd_str = " ".join(cmd)
7884
if "show techsupport" in cmd_str:
79-
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
85+
patcher.fs.create_file(ts_dump)
8086
else:
8187
return 1, "", "Command Not Found"
82-
return 0, "", ""
88+
return 0, AUTO_TS_STDOUT + ts_dump, ""
8389
cdump_mod.subprocess_exec = mock_cmd
8490
patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
8591
patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz")
@@ -105,12 +111,13 @@ def test_global_rate_limit_interval(self):
105111
populate_state_db(redis_mock)
106112
with Patcher() as patcher:
107113
def mock_cmd(cmd, env):
114+
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
108115
cmd_str = " ".join(cmd)
109116
if "show techsupport" in cmd_str:
110-
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
117+
patcher.fs.create_file(ts_dump)
111118
else:
112119
return 1, "", "Command Not Found"
113-
return 0, "", ""
120+
return 0, AUTO_TS_STDOUT + ts_dump, ""
114121
cdump_mod.subprocess_exec = mock_cmd
115122
patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
116123
patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz")
@@ -138,12 +145,13 @@ def test_per_container_rate_limit_interval(self):
138145
"orchagent;{};swss".format(int(time.time()))})
139146
with Patcher() as patcher:
140147
def mock_cmd(cmd, env):
148+
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
141149
cmd_str = " ".join(cmd)
142150
if "show techsupport" in cmd_str:
143-
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
151+
patcher.fs.create_file(ts_dump)
144152
else:
145153
return 1, "", "Command Not Found"
146-
return 0, "", ""
154+
return 0, AUTO_TS_STDOUT + ts_dump, ""
147155
cdump_mod.subprocess_exec = mock_cmd
148156
patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
149157
patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz")
@@ -167,12 +175,13 @@ def test_invoc_ts_after_rate_limit_interval(self):
167175
"orchagent;{};swss".format(int(time.time()))})
168176
with Patcher() as patcher:
169177
def mock_cmd(cmd, env):
178+
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
170179
cmd_str = " ".join(cmd)
171180
if "show techsupport" in cmd_str:
172-
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
181+
patcher.fs.create_file(ts_dump)
173182
else:
174183
return 1, "", "Command Not Found"
175-
return 0, "", ""
184+
return 0, AUTO_TS_STDOUT + ts_dump, ""
176185
cdump_mod.subprocess_exec = mock_cmd
177186
patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
178187
patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz")
@@ -197,12 +206,13 @@ def test_core_dump_with_invalid_container_name(self):
197206
populate_state_db(redis_mock, {})
198207
with Patcher() as patcher:
199208
def mock_cmd(cmd, env):
209+
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
200210
cmd_str = " ".join(cmd)
201211
if "show techsupport" in cmd_str:
202-
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
212+
patcher.fs.create_file(ts_dump)
203213
else:
204214
return 1, "", "Command Not Found"
205-
return 0, "", ""
215+
return 0, AUTO_TS_STDOUT + ts_dump, ""
206216
cdump_mod.subprocess_exec = mock_cmd
207217
patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
208218
patcher.fs.create_file("/var/core/snmpd.12345.123.core.gz")
@@ -225,12 +235,13 @@ def test_feature_table_not_set(self):
225235
populate_state_db(redis_mock, {})
226236
with Patcher() as patcher:
227237
def mock_cmd(cmd, env):
238+
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
228239
cmd_str = " ".join(cmd)
229240
if "show techsupport" in cmd_str:
230-
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
241+
patcher.fs.create_file(ts_dump)
231242
else:
232243
return 1, "", "Command Not Found"
233-
return 0, "", ""
244+
return 0, AUTO_TS_STDOUT + ts_dump, ""
234245
cdump_mod.subprocess_exec = mock_cmd
235246
patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
236247
patcher.fs.create_file("/var/core/python3.12345.123.core.gz")
@@ -251,10 +262,11 @@ def test_since_argument(self):
251262
populate_state_db(redis_mock)
252263
with Patcher() as patcher:
253264
def mock_cmd(cmd, env):
265+
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
254266
cmd_str = " ".join(cmd)
255-
if "show techsupport --since '4 days ago'" in cmd_str:
256-
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
257-
return 0, "", ""
267+
if "--since '4 days ago'" in cmd_str:
268+
patcher.fs.create_file(ts_dump)
269+
return 0, AUTO_TS_STDOUT + ts_dump, ""
258270
elif "date --date='4 days ago'" in cmd_str:
259271
return 0, "", ""
260272
else:
@@ -284,12 +296,13 @@ def test_masic_core_dump(self):
284296
populate_state_db(redis_mock)
285297
with Patcher() as patcher:
286298
def mock_cmd(cmd, env):
299+
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
287300
cmd_str = " ".join(cmd)
288301
if "show techsupport" in cmd_str:
289-
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
302+
patcher.fs.create_file(ts_dump)
290303
else:
291304
return 1, "", "Command Not Found"
292-
return 0, "", ""
305+
return 0, AUTO_TS_STDOUT + ts_dump, ""
293306
cdump_mod.subprocess_exec = mock_cmd
294307
patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
295308
patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz")
@@ -315,10 +328,12 @@ def test_invalid_since_argument(self):
315328
populate_state_db(redis_mock)
316329
with Patcher() as patcher:
317330
def mock_cmd(cmd, env):
331+
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
318332
cmd_str = " ".join(cmd)
319-
if "show techsupport --since '2 days ago'" in cmd_str:
320-
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
321-
return 0, "", ""
333+
if "--since '2 days ago'" in cmd_str:
334+
patcher.fs.create_file(ts_dump)
335+
print(AUTO_TS_STDOUT + ts_dump)
336+
return 0, AUTO_TS_STDOUT + ts_dump, ""
322337
elif "date --date='whatever'" in cmd_str:
323338
return 1, "", "Invalid Date Format"
324339
else:
@@ -370,7 +385,7 @@ def mock_cmd(cmd, env):
370385
cmd_str = " ".join(cmd)
371386
if "show techsupport" in cmd_str:
372387
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
373-
return 0, "", ""
388+
return 0, AUTO_TS_STDOUT + ts_dump, ""
374389
patcher.fs.set_disk_usage(2000, path="/var/core/")
375390
patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz", st_size=25)
376391
patcher.fs.create_file("/var/core/lldpmgrd.12345.22.core.gz", st_size=25)

utilities_common/auto_techsupport_helper.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
"CFG_DB", "AUTO_TS", "CFG_STATE", "CFG_MAX_TS", "COOLOFF",
1313
"CFG_CORE_USAGE", "CFG_SINCE", "FEATURE", "STATE_DB",
1414
"TS_MAP", "CORE_DUMP", "TIMESTAMP", "CONTAINER",
15-
"TIME_BUF", "SINCE_DEFAULT"
15+
"TIME_BUF", "SINCE_DEFAULT", "TS_PTRN_GLOB"
1616
] + [ # Methods
1717
"verify_recent_file_creation",
1818
"get_ts_dumps",
@@ -30,7 +30,9 @@
3030
CORE_DUMP_PTRN = "*.core.gz"
3131

3232
TS_DIR = "/var/dump"
33-
TS_PTRN = "sonic_dump_*.tar*"
33+
TS_ROOT = "sonic_dump_*"
34+
TS_PTRN = "sonic_dump_.*tar.*" # Regex Exp
35+
TS_PTRN_GLOB = "sonic_dump_*tar*" # Glob Exp
3436

3537
# CONFIG DB Attributes
3638
CFG_DB = "CONFIG_DB"
@@ -78,8 +80,10 @@ def strip_ts_ext(ts_path):
7880

7981

8082
def get_ts_dumps(full_path=False):
81-
""" Get the list of TS dumps in the TS_DIR, sorted by the creation time """
82-
curr_list = glob.glob(os.path.join(TS_DIR, TS_PTRN))
83+
"""
84+
Get the list of TS dumps in the TS_DIR, sorted by the creation time
85+
"""
86+
curr_list = glob.glob(os.path.join(TS_DIR, TS_ROOT))
8387
curr_list.sort(key=os.path.getmtime)
8488
if full_path:
8589
return curr_list

0 commit comments

Comments
 (0)