@@ -14,12 +14,6 @@ from swsscommon import swsscommon
14
14
15
15
from supervisor import childutils
16
16
17
- # Each line of this file should specify one process, (as defined in supervisord.conf file), in the
18
- # following format:
19
- #
20
- # program:<process_name>
21
- WATCH_PROCESSES_FILE = '/etc/supervisor/watchdog_processes'
22
-
23
17
# Each line of this file should specify either one critical process or one
24
18
# critical process group, (as defined in supervisord.conf file), in the
25
19
# following format:
@@ -40,40 +34,40 @@ ALERTING_INTERVAL_SECS = 60
40
34
# Source name passed to swsscommon.events_init_publisher() when main() creates
# the event publisher handle.
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
41
35
# Tag for published events; presumably identifies "process exited unexpectedly"
# events -- its use is outside the visible hunks, confirm against main().
EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly"
42
36
43
def get_critical_group_and_process_list(process_file=None):
    """
    @summary: Read the critical group and process names from a supervisord
              critical-processes file.
    @param process_file: Path of the file to parse. When None (the default),
                         CRITICAL_PROCESSES_FILE is read.
    @return: Two lists, in this order: the critical group names, then the
             critical process names. Each list keeps the order of the file.

    Every non-blank line must have the form `group:<name>` or `program:<name>`.
    A malformed line is logged to syslog at LOG_ERR and the listener exits:
    code 5 when the line does not contain exactly one ':', code 6 when the key
    is neither `group` nor `program` or the name is empty.
    """
    if process_file is None:
        process_file = CRITICAL_PROCESSES_FILE

    syntax_error = "Syntax of the line {} in critical_processes file is incorrect. Exiting..."

    critical_group_list = []
    critical_process_list = []
    # Maps the identifier key of a line to the list its value belongs to.
    list_by_key = {"group": critical_group_list, "program": critical_process_list}

    with open(process_file, 'r') as file:
        for line in file:
            # ignore blank lines
            if re.match(r"^\s*$", line):
                continue

            # Exactly one ':' must separate the key from the value.
            line_info = line.strip().split(':')
            if len(line_info) != 2:
                syslog.syslog(syslog.LOG_ERR, syntax_error.format(line))
                sys.exit(5)

            identifier_key = line_info[0].strip()
            identifier_value = line_info[1].strip()
            target_list = list_by_key.get(identifier_key)
            # Unknown key or empty name: the file is unusable, so bail out.
            if target_list is None or not identifier_value:
                syslog.syslog(syslog.LOG_ERR, syntax_error.format(line))
                sys.exit(6)

            target_list.append(identifier_value)

    return critical_group_list, critical_process_list
74
68
75
69
76
- def generate_alerting_message (process_name , status , dead_minutes ):
70
+ def generate_alerting_message (process_name , dead_minutes ):
77
71
"""
78
72
@summary: If a critical process was not running, this function will determine it resides in host
79
73
or in a specific namespace. Then an alerting message will be written into syslog.
@@ -86,8 +80,8 @@ def generate_alerting_message(process_name, status, dead_minutes):
86
80
else :
87
81
namespace = namespace_prefix + namespace_id
88
82
89
- syslog .syslog (syslog .LOG_ERR , "Process '{}' is {} in namespace '{}' ({} minutes)."
90
- .format (process_name , status , namespace , dead_minutes ))
83
+ syslog .syslog (syslog .LOG_ERR , "Process '{}' is not running in namespace '{}' ({} minutes)."
84
+ .format (process_name , namespace , dead_minutes ))
91
85
92
86
93
87
def get_autorestart_state (container_name ):
@@ -131,11 +125,9 @@ def main(argv):
131
125
syslog .syslog (syslog .LOG_ERR , "Container name not specified. Exiting..." )
132
126
sys .exit (1 )
133
127
134
- critical_group_list , critical_process_list = get_group_and_process_list (CRITICAL_PROCESSES_FILE )
135
- _ , watch_process_list = get_group_and_process_list (WATCH_PROCESSES_FILE )
128
+ critical_group_list , critical_process_list = get_critical_group_and_process_list ()
136
129
137
130
process_under_alerting = defaultdict (dict )
138
- process_heart_beat_info = defaultdict (dict )
139
131
# Transition from ACKNOWLEDGED to READY
140
132
childutils .listener .ready ()
141
133
events_handle = swsscommon .events_init_publisher (EVENTS_PUBLISHER_SOURCE )
@@ -175,15 +167,6 @@ def main(argv):
175
167
if process_name in process_under_alerting :
176
168
process_under_alerting .pop (process_name )
177
169
178
- # Handle the PROCESS_COMMUNICATION_STDOUT event
179
- elif headers ['eventname' ] == 'PROCESS_COMMUNICATION_STDOUT' :
180
- payload_headers , payload_data = childutils .eventdata (payload + '\n ' )
181
- process_name = payload_headers ['processname' ]
182
-
183
- # update process heart beat time
184
- if (process_name in watch_process_list ):
185
- process_heart_beat_info [process_name ]["last_heart_beat" ] = time .time ()
186
-
187
170
# Transition from BUSY to ACKNOWLEDGED
188
171
childutils .listener .ok ()
189
172
@@ -198,15 +181,7 @@ def main(argv):
198
181
elapsed_mins = elapsed_secs // 60
199
182
process_under_alerting [process_name ]["last_alerted" ] = epoch_time
200
183
process_under_alerting [process_name ]["dead_minutes" ] += elapsed_mins
201
- generate_alerting_message (process_name , "not running" , process_under_alerting [process_name ]["dead_minutes" ])
202
-
203
- # Check whether we need write alerting messages into syslog
204
- for process in process_heart_beat_info .keys ():
205
- epoch_time = time .time ()
206
- elapsed_secs = epoch_time - process_heart_beat_info [process ]["last_heart_beat" ]
207
- if elapsed_secs >= ALERTING_INTERVAL_SECS :
208
- elapsed_mins = elapsed_secs // 60
209
- generate_alerting_message (process , "stuck" , elapsed_mins )
184
+ generate_alerting_message (process_name , process_under_alerting [process_name ]["dead_minutes" ])
210
185
211
186
if __name__ == "__main__" :
212
187
main (sys .argv [1 :])
0 commit comments