5
5
For more info, refer to the Event Driven TechSupport & CoreDump Mgmt HLD
6
6
"""
7
7
import os
8
- import time
9
8
import argparse
10
9
import syslog
11
- import re
12
10
from swsscommon .swsscommon import SonicV2Connector
13
11
from utilities_common .auto_techsupport_helper import *
14
12
15
- # Explicitly pass this to the subprocess invoking techsupport
16
- ENV_VAR = os .environ
17
- PATH_PREV = ENV_VAR ["PATH" ] if "PATH" in ENV_VAR else ""
18
- ENV_VAR ["PATH" ] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:" + PATH_PREV
19
-
20
13
21
14
def handle_coredump_cleanup (dump_name , db ):
22
15
_ , num_bytes = get_stats (os .path .join (CORE_DUMP_DIR , CORE_DUMP_PTRN ))
@@ -49,8 +42,6 @@ def __init__(self, core_name, container_name, db):
49
42
self .core_name = core_name
50
43
self .container = container_name
51
44
self .db = db
52
- self .proc_mp = {}
53
- self .core_ts_map = {}
54
45
55
46
def handle_core_dump_creation_event (self ):
56
47
if self .db .get (CFG_DB , AUTO_TS , CFG_STATE ) != "enabled" :
@@ -66,112 +57,8 @@ def handle_core_dump_creation_event(self):
66
57
syslog .syslog (syslog .LOG_NOTICE , msg .format (self .container , self .core_name ))
67
58
return
68
59
69
- global_cooloff = self .db .get (CFG_DB , AUTO_TS , COOLOFF )
70
- container_cooloff = self .db .get (CFG_DB , FEATURE_KEY , COOLOFF )
71
-
72
- try :
73
- global_cooloff = float (global_cooloff )
74
- except ValueError :
75
- global_cooloff = 0.0
76
-
77
- try :
78
- container_cooloff = float (container_cooloff )
79
- except ValueError :
80
- container_cooloff = 0.0
81
-
82
- cooloff_passed = self .verify_rate_limit_intervals (global_cooloff , container_cooloff )
83
- if cooloff_passed :
84
- since_cfg = self .get_since_arg ()
85
- new_file = self .invoke_ts_cmd (since_cfg )
86
- if new_file :
87
- self .write_to_state_db (int (time .time ()), new_file )
88
-
89
- def write_to_state_db (self , timestamp , ts_dump ):
90
- name = strip_ts_ext (ts_dump )
91
- key = TS_MAP + "|" + name
92
- self .db .set (STATE_DB , key , CORE_DUMP , self .core_name )
93
- self .db .set (STATE_DB , key , TIMESTAMP , str (timestamp ))
94
- self .db .set (STATE_DB , key , CONTAINER , self .container )
95
-
96
- def get_since_arg (self ):
97
- since_cfg = self .db .get (CFG_DB , AUTO_TS , CFG_SINCE )
98
- if not since_cfg :
99
- return SINCE_DEFAULT
100
- rc , _ , stderr = subprocess_exec (["date" , "--date={}" .format (since_cfg )], env = ENV_VAR )
101
- if rc == 0 :
102
- return since_cfg
103
- return SINCE_DEFAULT
104
-
105
- def parse_ts_dump_name (self , ts_stdout ):
106
- """ Figure out the ts_dump name from the techsupport stdout """
107
- matches = re .findall (TS_PTRN , ts_stdout )
108
- if matches :
109
- return matches [- 1 ]
110
- syslog .syslog (syslog .LOG_ERR , "stdout of the 'show techsupport' cmd doesn't have the dump name" )
111
- return ""
112
-
113
- def invoke_ts_cmd (self , since_cfg , num_retry = 0 ):
114
- cmd_opts = ["show" , "techsupport" , "--silent" , "--global-timeout" , TS_GLOBAL_TIMEOUT , "--since" , since_cfg ]
115
- cmd = " " .join (cmd_opts )
116
- rc , stdout , stderr = subprocess_exec (cmd_opts , env = ENV_VAR )
117
- new_dump = ""
118
- if rc == EXT_LOCKFAIL :
119
- syslog .syslog (syslog .LOG_NOTICE , "Another instance of techsupport running, aborting this. stderr: {}" .format (stderr ))
120
- elif rc == EXT_RETRY :
121
- if num_retry <= MAX_RETRY_LIMIT :
122
- return self .invoke_ts_cmd (since_cfg , num_retry + 1 )
123
- else :
124
- syslog .syslog (syslog .LOG_ERR , "MAX_RETRY_LIMIT for show techsupport invocation exceeded, stderr: {}" .format (stderr ))
125
- elif rc != EXT_SUCCESS :
126
- syslog .syslog (syslog .LOG_ERR , "show techsupport failed with exit code {}, stderr: {}" .format (rc , stderr ))
127
- else : # EXT_SUCCESS
128
- new_dump = self .parse_ts_dump_name (stdout ) # Parse the dump name
129
- if not new_dump :
130
- syslog .syslog (syslog .LOG_ERR , "{} was run, but no techsupport dump is found" .format (cmd ))
131
- else :
132
- syslog .syslog (syslog .LOG_INFO , "{} is successful, {} is created" .format (cmd , new_dump ))
133
- return new_dump
134
-
135
- def verify_rate_limit_intervals (self , global_cooloff , container_cooloff ):
136
- """Verify both the global and per-proc rate_limit_intervals have passed"""
137
- curr_ts_list = get_ts_dumps (True )
138
- if global_cooloff and curr_ts_list :
139
- last_ts_dump_creation = os .path .getmtime (curr_ts_list [- 1 ])
140
- if time .time () - last_ts_dump_creation < global_cooloff :
141
- msg = "Global rate_limit_interval period has not passed. Techsupport Invocation is skipped. Core: {}"
142
- syslog .syslog (syslog .LOG_INFO , msg .format (self .core_name ))
143
- return False
144
-
145
- self .parse_ts_map ()
146
- if container_cooloff and self .container in self .core_ts_map :
147
- last_creation_time = self .core_ts_map [self .container ][0 ][0 ]
148
- if time .time () - last_creation_time < container_cooloff :
149
- msg = "Per Container rate_limit_interval for {} has not passed. Techsupport Invocation is skipped. Core: {}"
150
- syslog .syslog (syslog .LOG_INFO , msg .format (self .container , self .core_name ))
151
- return False
152
- return True
153
-
154
- def parse_ts_map (self ):
155
- """Create proc_name, ts_dump & creation_time map"""
156
- ts_keys = self .db .keys (STATE_DB , TS_MAP + "*" )
157
- if not ts_keys :
158
- return
159
- for ts_key in ts_keys :
160
- data = self .db .get_all (STATE_DB , ts_key )
161
- if not data :
162
- continue
163
- container_name = data .get (CONTAINER , "" )
164
- creation_time = data .get (TIMESTAMP , "" )
165
- try :
166
- creation_time = int (creation_time )
167
- except Exception :
168
- continue # if the creation time is invalid, skip the entry
169
- ts_dump = ts_key .split ("|" )[- 1 ]
170
- if container_name and container_name not in self .core_ts_map :
171
- self .core_ts_map [container_name ] = []
172
- self .core_ts_map [container_name ].append ((int (creation_time ), ts_dump ))
173
- for container_name in self .core_ts_map :
174
- self .core_ts_map [container_name ].sort ()
60
+ invoke_ts_command_rate_limited (self .db , EVENT_TYPE_CORE , {CORE_DUMP : self .core_name }, self .container )
61
+
175
62
176
63
def main ():
177
64
parser = argparse .ArgumentParser (description = 'Auto Techsupport Invocation and CoreDump Mgmt Script' )
0 commit comments