Skip to content

Commit d112f7c

Browse files
[202205][auto-ts] add memory check (#2116) (#2413)
Backport of #2116. - What I did: Implemented a memory threshold check in the auto-techsupport feature according to sonic-net/SONiC#939. - How I did it: Added two scripts, the check script and the handler script. A few modifications were made to the auto-techsupport implementation. Unit tests (UT) were added. - How to verify it: Run the action script and the handler script on the switch. Run the unit tests. Signed-off-by: Stepan Blyschak <[email protected]>
1 parent 99ed8ea commit d112f7c

14 files changed

+671
-134
lines changed

config/plugins/auto_techsupport.py

Lines changed: 58 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -228,6 +228,50 @@ def AUTO_TECHSUPPORT_GLOBAL_max_core_limit(db, max_core_limit):
228228
exit_with_error(f"Error: {err}", fg="red")
229229

230230

231+
@AUTO_TECHSUPPORT_GLOBAL.command(name="available-mem-threshold")
232+
@click.argument(
233+
"available-mem-threshold",
234+
nargs=1,
235+
required=True,
236+
)
237+
@clicommon.pass_db
238+
def AUTO_TECHSUPPORT_GLOBAL_available_mem_threshold(db, available_mem_threshold):
239+
""" Memory threshold; 0 to disable techsupport invocation on memory usage threshold crossing.
240+
"""
241+
242+
table = "AUTO_TECHSUPPORT"
243+
key = "GLOBAL"
244+
data = {
245+
"available_mem_threshold": available_mem_threshold,
246+
}
247+
try:
248+
update_entry_validated(db.cfgdb, table, key, data, create_if_not_exists=True)
249+
except Exception as err:
250+
exit_with_error(f"Error: {err}", fg="red")
251+
252+
253+
@AUTO_TECHSUPPORT_GLOBAL.command(name="min-available-mem")
254+
@click.argument(
255+
"min-available-mem",
256+
nargs=1,
257+
required=True,
258+
)
259+
@clicommon.pass_db
260+
def AUTO_TECHSUPPORT_GLOBAL_min_available_mem(db, min_available_mem):
261+
""" Minimum free memory amount in Kb when techsupport will be executed.
262+
"""
263+
264+
table = "AUTO_TECHSUPPORT"
265+
key = "GLOBAL"
266+
data = {
267+
"min_available_mem": min_available_mem,
268+
}
269+
try:
270+
update_entry_validated(db.cfgdb, table, key, data, create_if_not_exists=True)
271+
except Exception as err:
272+
exit_with_error(f"Error: {err}", fg="red")
273+
274+
231275
@AUTO_TECHSUPPORT_GLOBAL.command(name="since")
232276
@click.argument(
233277
"since",
@@ -271,8 +315,12 @@ def AUTO_TECHSUPPORT_FEATURE():
271315
"--rate-limit-interval",
272316
help="Rate limit interval for the corresponding feature. Configure 0 to explicitly disable",
273317
)
318+
@click.option(
319+
"--available-mem-threshold",
320+
help="Memory threshold; 0 to disable techsupport invocation on memory usage threshold crossing.",
321+
)
274322
@clicommon.pass_db
275-
def AUTO_TECHSUPPORT_FEATURE_add(db, feature_name, state, rate_limit_interval):
323+
def AUTO_TECHSUPPORT_FEATURE_add(db, feature_name, state, rate_limit_interval, available_mem_threshold):
276324
""" Add object in AUTO_TECHSUPPORT_FEATURE. """
277325

278326
table = "AUTO_TECHSUPPORT_FEATURE"
@@ -282,6 +330,8 @@ def AUTO_TECHSUPPORT_FEATURE_add(db, feature_name, state, rate_limit_interval):
282330
data["state"] = state
283331
if rate_limit_interval is not None:
284332
data["rate_limit_interval"] = rate_limit_interval
333+
if available_mem_threshold is not None:
334+
data["available_mem_threshold"] = available_mem_threshold
285335

286336
try:
287337
add_entry_validated(db.cfgdb, table, key, data)
@@ -303,8 +353,12 @@ def AUTO_TECHSUPPORT_FEATURE_add(db, feature_name, state, rate_limit_interval):
303353
"--rate-limit-interval",
304354
help="Rate limit interval for the corresponding feature. Configure 0 to explicitly disable",
305355
)
356+
@click.option(
357+
"--available-mem-threshold",
358+
help="Memory threshold; 0 to disable techsupport invocation on memory usage threshold crossing.",
359+
)
306360
@clicommon.pass_db
307-
def AUTO_TECHSUPPORT_FEATURE_update(db, feature_name, state, rate_limit_interval):
361+
def AUTO_TECHSUPPORT_FEATURE_update(db, feature_name, state, rate_limit_interval, available_mem_threshold):
308362
""" Add object in AUTO_TECHSUPPORT_FEATURE. """
309363

310364
table = "AUTO_TECHSUPPORT_FEATURE"
@@ -314,6 +368,8 @@ def AUTO_TECHSUPPORT_FEATURE_update(db, feature_name, state, rate_limit_interval
314368
data["state"] = state
315369
if rate_limit_interval is not None:
316370
data["rate_limit_interval"] = rate_limit_interval
371+
if available_mem_threshold is not None:
372+
data["available_mem_threshold"] = available_mem_threshold
317373

318374
try:
319375
update_entry_validated(db.cfgdb, table, key, data)

scripts/coredump_gen_handler.py

Lines changed: 2 additions & 115 deletions
Original file line number | Diff line number | Diff line change
@@ -5,18 +5,11 @@
55
For more info, refer to the Event Driven TechSupport & CoreDump Mgmt HLD
66
"""
77
import os
8-
import time
98
import argparse
109
import syslog
11-
import re
1210
from swsscommon.swsscommon import SonicV2Connector
1311
from utilities_common.auto_techsupport_helper import *
1412

15-
# Explicity Pass this to the subprocess invoking techsupport
16-
ENV_VAR = os.environ
17-
PATH_PREV = ENV_VAR["PATH"] if "PATH" in ENV_VAR else ""
18-
ENV_VAR["PATH"] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:" + PATH_PREV
19-
2013

2114
def handle_coredump_cleanup(dump_name, db):
2215
_, num_bytes = get_stats(os.path.join(CORE_DUMP_DIR, CORE_DUMP_PTRN))
@@ -49,8 +42,6 @@ def __init__(self, core_name, container_name, db):
4942
self.core_name = core_name
5043
self.container = container_name
5144
self.db = db
52-
self.proc_mp = {}
53-
self.core_ts_map = {}
5445

5546
def handle_core_dump_creation_event(self):
5647
if self.db.get(CFG_DB, AUTO_TS, CFG_STATE) != "enabled":
@@ -66,112 +57,8 @@ def handle_core_dump_creation_event(self):
6657
syslog.syslog(syslog.LOG_NOTICE, msg.format(self.container, self.core_name))
6758
return
6859

69-
global_cooloff = self.db.get(CFG_DB, AUTO_TS, COOLOFF)
70-
container_cooloff = self.db.get(CFG_DB, FEATURE_KEY, COOLOFF)
71-
72-
try:
73-
global_cooloff = float(global_cooloff)
74-
except ValueError:
75-
global_cooloff = 0.0
76-
77-
try:
78-
container_cooloff = float(container_cooloff)
79-
except ValueError:
80-
container_cooloff = 0.0
81-
82-
cooloff_passed = self.verify_rate_limit_intervals(global_cooloff, container_cooloff)
83-
if cooloff_passed:
84-
since_cfg = self.get_since_arg()
85-
new_file = self.invoke_ts_cmd(since_cfg)
86-
if new_file:
87-
self.write_to_state_db(int(time.time()), new_file)
88-
89-
def write_to_state_db(self, timestamp, ts_dump):
90-
name = strip_ts_ext(ts_dump)
91-
key = TS_MAP + "|" + name
92-
self.db.set(STATE_DB, key, CORE_DUMP, self.core_name)
93-
self.db.set(STATE_DB, key, TIMESTAMP, str(timestamp))
94-
self.db.set(STATE_DB, key, CONTAINER, self.container)
95-
96-
def get_since_arg(self):
97-
since_cfg = self.db.get(CFG_DB, AUTO_TS, CFG_SINCE)
98-
if not since_cfg:
99-
return SINCE_DEFAULT
100-
rc, _, stderr = subprocess_exec(["date", "--date={}".format(since_cfg)], env=ENV_VAR)
101-
if rc == 0:
102-
return since_cfg
103-
return SINCE_DEFAULT
104-
105-
def parse_ts_dump_name(self, ts_stdout):
106-
""" Figure out the ts_dump name from the techsupport stdout """
107-
matches = re.findall(TS_PTRN, ts_stdout)
108-
if matches:
109-
return matches[-1]
110-
syslog.syslog(syslog.LOG_ERR, "stdout of the 'show techsupport' cmd doesn't have the dump name")
111-
return ""
112-
113-
def invoke_ts_cmd(self, since_cfg, num_retry=0):
114-
cmd_opts = ["show", "techsupport", "--silent", "--global-timeout", TS_GLOBAL_TIMEOUT, "--since", since_cfg]
115-
cmd = " ".join(cmd_opts)
116-
rc, stdout, stderr = subprocess_exec(cmd_opts, env=ENV_VAR)
117-
new_dump = ""
118-
if rc == EXT_LOCKFAIL:
119-
syslog.syslog(syslog.LOG_NOTICE, "Another instance of techsupport running, aborting this. stderr: {}".format(stderr))
120-
elif rc == EXT_RETRY:
121-
if num_retry <= MAX_RETRY_LIMIT:
122-
return self.invoke_ts_cmd(since_cfg, num_retry+1)
123-
else:
124-
syslog.syslog(syslog.LOG_ERR, "MAX_RETRY_LIMIT for show techsupport invocation exceeded, stderr: {}".format(stderr))
125-
elif rc != EXT_SUCCESS:
126-
syslog.syslog(syslog.LOG_ERR, "show techsupport failed with exit code {}, stderr: {}".format(rc, stderr))
127-
else: # EXT_SUCCESS
128-
new_dump = self.parse_ts_dump_name(stdout) # Parse the dump name
129-
if not new_dump:
130-
syslog.syslog(syslog.LOG_ERR, "{} was run, but no techsupport dump is found".format(cmd))
131-
else:
132-
syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, new_dump))
133-
return new_dump
134-
135-
def verify_rate_limit_intervals(self, global_cooloff, container_cooloff):
136-
"""Verify both the global and per-proc rate_limit_intervals have passed"""
137-
curr_ts_list = get_ts_dumps(True)
138-
if global_cooloff and curr_ts_list:
139-
last_ts_dump_creation = os.path.getmtime(curr_ts_list[-1])
140-
if time.time() - last_ts_dump_creation < global_cooloff:
141-
msg = "Global rate_limit_interval period has not passed. Techsupport Invocation is skipped. Core: {}"
142-
syslog.syslog(syslog.LOG_INFO, msg.format(self.core_name))
143-
return False
144-
145-
self.parse_ts_map()
146-
if container_cooloff and self.container in self.core_ts_map:
147-
last_creation_time = self.core_ts_map[self.container][0][0]
148-
if time.time() - last_creation_time < container_cooloff:
149-
msg = "Per Container rate_limit_interval for {} has not passed. Techsupport Invocation is skipped. Core: {}"
150-
syslog.syslog(syslog.LOG_INFO, msg.format(self.container, self.core_name))
151-
return False
152-
return True
153-
154-
def parse_ts_map(self):
155-
"""Create proc_name, ts_dump & creation_time map"""
156-
ts_keys = self.db.keys(STATE_DB, TS_MAP+"*")
157-
if not ts_keys:
158-
return
159-
for ts_key in ts_keys:
160-
data = self.db.get_all(STATE_DB, ts_key)
161-
if not data:
162-
continue
163-
container_name = data.get(CONTAINER, "")
164-
creation_time = data.get(TIMESTAMP, "")
165-
try:
166-
creation_time = int(creation_time)
167-
except Exception:
168-
continue # if the creation time is invalid, skip the entry
169-
ts_dump = ts_key.split("|")[-1]
170-
if container_name and container_name not in self.core_ts_map:
171-
self.core_ts_map[container_name] = []
172-
self.core_ts_map[container_name].append((int(creation_time), ts_dump))
173-
for container_name in self.core_ts_map:
174-
self.core_ts_map[container_name].sort()
60+
invoke_ts_command_rate_limited(self.db, EVENT_TYPE_CORE, {CORE_DUMP: self.core_name}, self.container)
61+
17562

17663
def main():
17764
parser = argparse.ArgumentParser(description='Auto Techsupport Invocation and CoreDump Mgmt Script')

0 commit comments

Comments
 (0)