Skip to content

Commit a3e34e3

Browse files
authored
[Auto Techsupport] Event driven Techsupport Changes (sonic-net#1796)
#### What I did: sonic-utilities changes required for the feature "Event Driven TechSupport Invocation & CoreDump Mgmt". [HLD](sonic-net/SONiC#818) Summary of the changes: - Added the auto-generated CLI for the CFG DB tables required for this feature - Added the coredump_gen_handler.py & techsupport_cleanup.py scripts. - Added the unit tests required for these scripts. - Enhanced the coredump-compress & generate-dump scripts
1 parent efa2ff6 commit a3e34e3

10 files changed

+1458
-0
lines changed

config/plugins/auto_techsupport.py

+350
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,350 @@
1+
"""
2+
Autogenerated config CLI plugin.
3+
"""
4+
5+
import click
6+
import utilities_common.cli as clicommon
7+
import utilities_common.general as general
8+
from config import config_mgmt
9+
10+
11+
# Load sonic-cfggen from source since /usr/local/bin/sonic-cfggen does not have .py extension.
12+
sonic_cfggen = general.load_module_from_source('sonic_cfggen', '/usr/local/bin/sonic-cfggen')
13+
14+
15+
def exit_with_error(*args, **kwargs):
    """Print a styled message, then abort the running CLI command.

    Every positional and keyword argument is forwarded unchanged to
    ``click.secho``.

    Raises:
        click.Abort: always, once the message has been emitted.
    """
    click.secho(*args, **kwargs)
    raise click.Abort()
20+
21+
22+
def validate_config_or_raise(cfg):
    """Validate config db data using ConfigMgmt.

    The configuration is first converted with
    ``sonic_cfggen.FormatConverter.to_serialized`` and then handed to
    ``config_mgmt.ConfigMgmt().loadData``.

    Args:
        cfg: Config DB contents (callers in this file pass the result of
            ``db.get_config()`` after applying the pending change).

    Raises:
        Exception: if conversion or loading fails. The original error is
            attached as the explicit cause so the root failure stays visible
            in tracebacks.
    """
    try:
        serialized = sonic_cfggen.FormatConverter.to_serialized(cfg)
        config_mgmt.ConfigMgmt().loadData(serialized)
    except Exception as err:
        raise Exception('Failed to validate configuration: {}'.format(err)) from err
30+
31+
32+
def add_entry_validated(db, table, key, data):
    """Create a new entry in ``table`` after validating the resulting config.

    Args:
        db: Object providing ``get_config()`` and ``set_entry(table, key, data)``.
        table: Name of the config table.
        key: Key of the entry to create.
        data: Attribute mapping stored under ``key``.

    Raises:
        Exception: if ``key`` is already present in ``table``, or if
            ``validate_config_or_raise`` rejects the resulting configuration.
    """
    cfg = db.get_config()
    entries = cfg.setdefault(table, {})

    if key in entries:
        raise Exception(f"{key} already exists")
    entries[key] = data

    # Validate the full candidate configuration before writing to the DB.
    validate_config_or_raise(cfg)
    db.set_entry(table, key, data)
44+
45+
46+
def update_entry_validated(db, table, key, data, create_if_not_exists=False):
    """Update entry in table and validate configuration.

    If attribute value in data is None, the attribute is deleted.

    Args:
        db: Object providing ``get_config()`` and ``set_entry(table, key, data)``.
        table: Name of the config table.
        key: Key of the entry to update.
        data: Mapping of attribute name to new value; a ``None`` value
            removes the attribute.
        create_if_not_exists: When true, a missing entry is created empty
            before the update is applied instead of raising.

    Raises:
        Exception: if ``key`` is absent and ``create_if_not_exists`` is false,
            or if ``validate_config_or_raise`` rejects the resulting
            configuration.
    """
    cfg = db.get_config()
    entries = cfg.setdefault(table, {})

    if create_if_not_exists:
        entries.setdefault(key, {})

    if key not in entries:
        raise Exception(f"{key} does not exist")

    entry = entries[key]
    for attr, value in data.items():
        if value is None:
            # Deleting an attribute that is not set must be a no-op; it must
            # never be stored with a None value.
            entry.pop(attr, None)
        else:
            entry[attr] = value

    # Validate the full candidate configuration before writing to the DB.
    validate_config_or_raise(cfg)
    db.set_entry(table, key, entry)
68+
69+
70+
def del_entry_validated(db, table, key):
    """Remove an entry from ``table`` after validating the resulting config.

    Args:
        db: Object providing ``get_config()`` and ``set_entry(table, key, data)``.
        table: Name of the config table.
        key: Key of the entry to remove.

    Raises:
        Exception: if ``key`` is not present in ``table``, or if
            ``validate_config_or_raise`` rejects the resulting configuration.
    """
    cfg = db.get_config()
    entries = cfg.setdefault(table, {})

    if key not in entries:
        raise Exception(f"{key} does not exist")
    del entries[key]

    validate_config_or_raise(cfg)
    # The entry is written back as None (presumably how the DB layer deletes
    # a key -- confirm against the db implementation).
    db.set_entry(table, key, None)
82+
83+
84+
def add_list_entry_validated(db, table, key, attr, data):
    """Append items to a list attribute of an entry, validating the result.

    Args:
        db: Object providing ``get_config()`` and ``set_entry(table, key, data)``.
        table: Name of the config table.
        key: Key of the entry that owns the list.
        attr: Name of the list attribute (created empty when missing).
        data: Iterable of items to append.

    Raises:
        Exception: if ``key`` is not present in ``table``, if an item is
            already in the list (including an item repeated within ``data``),
            or if ``validate_config_or_raise`` rejects the resulting
            configuration.
    """
    cfg = db.get_config()
    entries = cfg.setdefault(table, {})
    if key not in entries:
        raise Exception(f"{key} does not exist")

    obj = entries[key]
    members = obj.setdefault(attr, [])
    for entry in data:
        if entry in members:
            raise Exception(f"{entry} already exists")
        members.append(entry)

    validate_config_or_raise(cfg)
    db.set_entry(table, key, obj)
99+
100+
101+
def del_list_entry_validated(db, table, key, attr, data):
    """Remove items from a list attribute of an entry, validating the result.

    When the list becomes empty the attribute itself is dropped from the
    entry.

    Args:
        db: Object providing ``get_config()`` and ``set_entry(table, key, data)``.
        table: Name of the config table.
        key: Key of the entry that owns the list.
        attr: Name of the list attribute.
        data: Iterable of items to remove.

    Raises:
        Exception: if ``key`` is not present in ``table``, if an item is not
            in the list, or if ``validate_config_or_raise`` rejects the
            resulting configuration.
    """
    cfg = db.get_config()
    entries = cfg.setdefault(table, {})
    if key not in entries:
        raise Exception(f"{key} does not exist")

    obj = entries[key]
    members = obj.setdefault(attr, [])
    for entry in data:
        if entry not in members:
            raise Exception(f"{entry} does not exist")
        members.remove(entry)

    # Do not leave an empty list behind in the stored entry.
    if not members:
        obj.pop(attr)

    validate_config_or_raise(cfg)
    db.set_entry(table, key, obj)
118+
119+
120+
def clear_list_entry_validated(db, table, key, attr):
    """Clear a list attribute of an entry and validate the configuration.

    Delegates to ``update_entry_validated`` with the attribute mapped to
    ``None``.
    """
    cleared = {attr: None}
    update_entry_validated(db, table, key, cleared)
124+
125+
126+
# Top-level "auto-techsupport" command group; the docstring below is the
# help text click shows for it.
@click.group(name="auto-techsupport", cls=clicommon.AliasedGroup)
def AUTO_TECHSUPPORT():
    """ AUTO_TECHSUPPORT part of config_db.json """
132+
133+
134+
# "global" sub-group of auto-techsupport. The docstring is intentionally kept
# as in the generated code (blank help text); `db` is injected by pass_db but
# not used by the group itself.
@AUTO_TECHSUPPORT.group(name="global", cls=clicommon.AliasedGroup)
@clicommon.pass_db
def AUTO_TECHSUPPORT_GLOBAL(db):
    """ """
141+
142+
143+
@AUTO_TECHSUPPORT_GLOBAL.command(name="state")
@click.argument("state", nargs=1, required=True)
@clicommon.pass_db
def AUTO_TECHSUPPORT_GLOBAL_state(db, state):
    """ Knob to make techsupport invocation event-driven based on core-dump generation """

    # Write AUTO_TECHSUPPORT|GLOBAL.state, creating the GLOBAL entry on first
    # use; any failure is reported in red and aborts the command.
    try:
        update_entry_validated(
            db.cfgdb,
            "AUTO_TECHSUPPORT",
            "GLOBAL",
            {"state": state},
            create_if_not_exists=True,
        )
    except Exception as err:
        exit_with_error(f"Error: {err}", fg="red")
162+
163+
164+
@AUTO_TECHSUPPORT_GLOBAL.command(name="rate-limit-interval")
@click.argument("rate-limit-interval", nargs=1, required=True)
@clicommon.pass_db
def AUTO_TECHSUPPORT_GLOBAL_rate_limit_interval(db, rate_limit_interval):
    """ Minimum time in seconds between two successive techsupport invocations. Configure 0 to explicitly disable """

    # Write AUTO_TECHSUPPORT|GLOBAL.rate_limit_interval, creating the GLOBAL
    # entry on first use; any failure is reported in red and aborts the command.
    try:
        update_entry_validated(
            db.cfgdb,
            "AUTO_TECHSUPPORT",
            "GLOBAL",
            {"rate_limit_interval": rate_limit_interval},
            create_if_not_exists=True,
        )
    except Exception as err:
        exit_with_error(f"Error: {err}", fg="red")
183+
184+
185+
@AUTO_TECHSUPPORT_GLOBAL.command(name="max-techsupport-limit")
@click.argument(
    "max-techsupport-limit",
    nargs=1,
    required=True,
)
@clicommon.pass_db
def AUTO_TECHSUPPORT_GLOBAL_max_techsupport_limit(db, max_techsupport_limit):
    """ Max Limit in percentage for the cumulative size of ts dumps.
    No cleanup is performed if the value isn't configured or is 0.0
    """

    # The docstring above is the user-visible help text for this command.
    table = "AUTO_TECHSUPPORT"
    key = "GLOBAL"
    data = {
        "max_techsupport_limit": max_techsupport_limit,
    }
    try:
        # The GLOBAL entry is created on first use.
        update_entry_validated(db.cfgdb, table, key, data, create_if_not_exists=True)
    except Exception as err:
        exit_with_error(f"Error: {err}", fg="red")
206+
207+
208+
@AUTO_TECHSUPPORT_GLOBAL.command(name="max-core-limit")
@click.argument(
    "max-core-limit",
    nargs=1,
    required=True,
)
@clicommon.pass_db
def AUTO_TECHSUPPORT_GLOBAL_max_core_limit(db, max_core_limit):
    """ Max Limit in percentage for the cumulative size of core dumps.
    No cleanup is performed if the value isn't configured or is 0.0
    """

    # The docstring above is the user-visible help text for this command.
    table = "AUTO_TECHSUPPORT"
    key = "GLOBAL"
    data = {
        "max_core_limit": max_core_limit,
    }
    try:
        # The GLOBAL entry is created on first use.
        update_entry_validated(db.cfgdb, table, key, data, create_if_not_exists=True)
    except Exception as err:
        exit_with_error(f"Error: {err}", fg="red")
229+
230+
231+
@AUTO_TECHSUPPORT_GLOBAL.command(name="since")
@click.argument("since", nargs=1, required=True)
@clicommon.pass_db
def AUTO_TECHSUPPORT_GLOBAL_since(db, since):
    """ Only collect the logs & core-dumps generated since the time provided.
    A default value of '2 days ago' is used if this value is not set explicitly or a non-valid string is provided """

    # Write AUTO_TECHSUPPORT|GLOBAL.since, creating the GLOBAL entry on first
    # use; any failure is reported in red and aborts the command.
    try:
        update_entry_validated(
            db.cfgdb,
            "AUTO_TECHSUPPORT",
            "GLOBAL",
            {"since": since},
            create_if_not_exists=True,
        )
    except Exception as err:
        exit_with_error(f"Error: {err}", fg="red")
251+
252+
253+
# Top-level "auto-techsupport-feature" command group; the docstring below is
# the help text click shows for it.
@click.group(name="auto-techsupport-feature", cls=clicommon.AliasedGroup)
def AUTO_TECHSUPPORT_FEATURE():
    """ AUTO_TECHSUPPORT_FEATURE part of config_db.json """
258+
259+
260+
@AUTO_TECHSUPPORT_FEATURE.command(name="add")
@click.argument("feature-name", nargs=1, required=True)
@click.option(
    "--state",
    help="Enable auto techsupport invocation on the processes running inside this feature",
)
@click.option(
    "--rate-limit-interval",
    help="Rate limit interval for the corresponding feature. Configure 0 to explicitly disable",
)
@clicommon.pass_db
def AUTO_TECHSUPPORT_FEATURE_add(db, feature_name, state, rate_limit_interval):
    """ Add object in AUTO_TECHSUPPORT_FEATURE. """

    # Only options the user actually supplied are stored in the new entry.
    supplied = {
        "state": state,
        "rate_limit_interval": rate_limit_interval,
    }
    data = {attr: value for attr, value in supplied.items() if value is not None}

    try:
        add_entry_validated(db.cfgdb, "AUTO_TECHSUPPORT_FEATURE", feature_name, data)
    except Exception as err:
        exit_with_error(f"Error: {err}", fg="red")
290+
291+
292+
@AUTO_TECHSUPPORT_FEATURE.command(name="update")
@click.argument(
    "feature-name",
    nargs=1,
    required=True,
)
@click.option(
    "--state",
    help="Enable auto techsupport invocation on the processes running inside this feature",
)
@click.option(
    "--rate-limit-interval",
    help="Rate limit interval for the corresponding feature. Configure 0 to explicitly disable",
)
@clicommon.pass_db
def AUTO_TECHSUPPORT_FEATURE_update(db, feature_name, state, rate_limit_interval):
    """ Update object in AUTO_TECHSUPPORT_FEATURE. """

    # The docstring above is the user-visible help text for this command.
    table = "AUTO_TECHSUPPORT_FEATURE"
    key = feature_name
    data = {}
    # Only options the user actually supplied are changed; the rest of the
    # existing entry is left untouched.
    if state is not None:
        data["state"] = state
    if rate_limit_interval is not None:
        data["rate_limit_interval"] = rate_limit_interval

    try:
        # No create_if_not_exists here: updating an unknown feature is an error.
        update_entry_validated(db.cfgdb, table, key, data)
    except Exception as err:
        exit_with_error(f"Error: {err}", fg="red")
322+
323+
324+
@AUTO_TECHSUPPORT_FEATURE.command(name="delete")
@click.argument("feature-name", nargs=1, required=True)
@clicommon.pass_db
def AUTO_TECHSUPPORT_FEATURE_delete(db, feature_name):
    """ Delete object in AUTO_TECHSUPPORT_FEATURE. """

    # Any failure (unknown feature, validation error) is reported in red and
    # aborts the command.
    try:
        del_entry_validated(db.cfgdb, "AUTO_TECHSUPPORT_FEATURE", feature_name)
    except Exception as err:
        exit_with_error(f"Error: {err}", fg="red")
340+
341+
342+
def register(cli):
    """Attach this plugin's command groups to ``cli``.

    The groups are processed in order (AUTO_TECHSUPPORT, then
    AUTO_TECHSUPPORT_FEATURE); each is checked and added before the next one
    is looked at.

    Raises:
        Exception: if a group's name is already present in ``cli.commands``.
    """
    for cli_node in (AUTO_TECHSUPPORT, AUTO_TECHSUPPORT_FEATURE):
        if cli_node.name in cli.commands:
            raise Exception(f"{cli_node.name} already exists in CLI")
        cli.add_command(cli_node)

scripts/coredump-compress

+17
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,28 @@ while [[ $# > 1 ]]; do
77
shift
88
done
99

10+
# ID of the docker container the crashing process runs in; left empty when
# no "pids:/docker/<id>" line is found in the process's cgroup file.
CONTAINER_ID=""
# Numeric comparison: inside `[ ]` a bare `>` is an output redirection, which
# would create a file named "0" and make the test always succeed.
if [ $# -gt 0 ]; then
    CONTAINER_ID=$(xargs -0 -L1 -a "/proc/${1}/cgroup" | grep -oP "pids:/docker/\K\w+")
    ns=$(xargs -0 -L1 -a "/proc/${1}/environ" | grep -e "^NAMESPACE_ID" | cut -f2 -d'=')
    # Append the namespace id (when the process has one) to the core file prefix.
    if [ -n "${ns}" ]; then
        PREFIX=${PREFIX}${ns}.
    fi
fi

# Compress the core dump arriving on stdin.
/bin/gzip -1 - > "/var/core/${PREFIX}core.gz"

if [[ -n "${CONTAINER_ID}" ]]; then
    # `docker inspect` prints the name with a leading "/"; cut strips it.
    CONTAINER_NAME=$(docker inspect --format='{{.Name}}' "${CONTAINER_ID}" | cut -c2-)
    if [[ -n "${CONTAINER_NAME}" ]]; then
        # coredump_gen_handler invokes techsupport if all the other required conditions are met
        # explicitly passing in the env vars because coredump-compress's namespace doesn't have these set by default
        for path in $(find /usr/local/lib/python3*/dist-packages -maxdepth 0); do
            PYTHONPATH=$PYTHONPATH:$path
        done
        # NOTE(review): the handler runs inside the command substitution (in
        # the backgrounded job), and `setsid` then receives that substitution's
        # output as its command line -- presumably empty, since all output is
        # redirected to the log. Confirm whether `setsid <command>` was the
        # intent before restructuring this.
        setsid $(echo > /tmp/coredump_gen_handler.log;
                 export PYTHONPATH=$PYTHONPATH;
                 python3 /usr/local/bin/coredump_gen_handler.py "${PREFIX}core.gz" "${CONTAINER_NAME}" &>> /tmp/coredump_gen_handler.log) &
    fi
fi
34+

0 commit comments

Comments
 (0)