@@ -13,6 +13,16 @@ from collections import defaultdict
13
13
from swsscommon import swsscommon
14
14
15
15
from supervisor import childutils
16
+ from supervisor .events import EventTypes , getEventNameByType
17
+
18
+ PROCESS_COMMUNICATION_STDOUT = \
19
+ getEventNameByType (EventTypes .PROCESS_COMMUNICATION_STDOUT )
20
+
21
+ # Each line of this file should specify one process whose heartbeat is watched (as defined in
+ # the supervisord.conf file), in the following format:
23
+ #
24
+ # program:<process_name>
25
+ WATCH_PROCESSES_FILE = '/etc/supervisor/watchdog_processes'
16
26
17
27
# Each line of this file should specify either one critical process or one
18
28
# critical process group, (as defined in supervisord.conf file), in the
@@ -34,40 +44,40 @@ ALERTING_INTERVAL_SECS = 60
34
44
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
35
45
EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly"
36
46
37
def get_group_and_process_list(process_file):
    """
    @summary: Read the group and process names listed in the given file.
    @param process_file: Path of a file in which every non-blank line has the form
                         'group:<group_name>' or 'program:<process_name>'.
    @return: Two lists which contain the group names and the process names respectively.
             The script exits with code 5 if a line does not consist of exactly two
             ':'-separated fields, and with code 6 if the key is neither 'group' nor
             'program' or the name after the ':' is empty.
    """
    group_list = []
    process_list = []
    # Maps the identifier key of a line to the list that collects its values.
    targets = {"group": group_list, "program": process_list}

    with open(process_file, 'r') as handle:
        for line in handle:
            # ignore blank lines
            if not line.strip():
                continue

            fields = line.strip().split(':')
            if len(fields) != 2:
                syslog.syslog(syslog.LOG_ERR,
                              "Syntax of the line {} in processes file is incorrect. Exiting...".format(line))
                sys.exit(5)

            identifier_key, identifier_value = (field.strip() for field in fields)
            # 'is None' is required here: the target lists start out empty (falsy).
            target = targets.get(identifier_key)
            if target is None or not identifier_value:
                syslog.syslog(syslog.LOG_ERR,
                              "Syntax of the line {} in processes file is incorrect. Exiting...".format(line))
                sys.exit(6)

            target.append(identifier_value)

    return group_list, process_list
68
78
69
79
70
- def generate_alerting_message (process_name , dead_minutes ):
80
+ def generate_alerting_message (process_name , status , dead_minutes ):
71
81
"""
72
82
@summary: Determine whether the process resides in the host or in a specific namespace, then
write an alerting message with the process name, its status and the given minutes into syslog.
@@ -80,8 +90,8 @@ def generate_alerting_message(process_name, dead_minutes):
80
90
else :
81
91
namespace = namespace_prefix + namespace_id
82
92
83
- syslog .syslog (syslog .LOG_ERR , "Process '{}' is not running in namespace '{}' ({} minutes)."
84
- .format (process_name , namespace , dead_minutes ))
93
+ syslog .syslog (syslog .LOG_ERR , "Process '{}' is {} in namespace '{}' ({} minutes)."
94
+ .format (process_name , status , namespace , dead_minutes ))
85
95
86
96
87
97
def get_autorestart_state (container_name ):
@@ -125,63 +135,87 @@ def main(argv):
125
135
syslog .syslog (syslog .LOG_ERR , "Container name not specified. Exiting..." )
126
136
sys .exit (1 )
127
137
128
- critical_group_list , critical_process_list = get_critical_group_and_process_list ()
138
+ critical_group_list , critical_process_list = get_group_and_process_list (CRITICAL_PROCESSES_FILE )
139
+ _ , watch_process_list = get_group_and_process_list (WATCH_PROCESSES_FILE )
129
140
130
141
process_under_alerting = defaultdict (dict )
142
+ process_heart_beat_info = defaultdict (dict )
131
143
# Transition from ACKNOWLEDGED to READY
132
144
childutils .listener .ready ()
133
145
events_handle = swsscommon .events_init_publisher (EVENTS_PUBLISHER_SOURCE )
134
146
while True :
135
- file_descriptor_list = select .select ([sys .stdin ], [], [], SELECT_TIMEOUT_SECS )[0 ]
136
- if len (file_descriptor_list ) > 0 :
137
- line = file_descriptor_list [0 ].readline ()
138
- headers = childutils .get_headers (line )
139
- payload = sys .stdin .read (int (headers ['len' ]))
140
-
141
- # Handle the PROCESS_STATE_EXITED event
142
- if headers ['eventname' ] == 'PROCESS_STATE_EXITED' :
143
- payload_headers , payload_data = childutils .eventdata (payload + '\n ' )
144
-
145
- expected = int (payload_headers ['expected' ])
146
- process_name = payload_headers ['processname' ]
147
- group_name = payload_headers ['groupname' ]
148
-
149
- if (process_name in critical_process_list or group_name in critical_group_list ) and expected == 0 :
150
- is_auto_restart = get_autorestart_state (container_name )
151
- if is_auto_restart != "disabled" :
152
- MSG_FORMAT_STR = "Process '{}' exited unexpectedly. Terminating supervisor '{}'"
153
- msg = MSG_FORMAT_STR .format (payload_headers ['processname' ], container_name )
154
- syslog .syslog (syslog .LOG_INFO , msg )
155
- publish_events (events_handle , payload_headers ['processname' ], container_name )
156
- swsscommon .events_deinit_publisher (events_handle )
157
- os .kill (os .getppid (), signal .SIGTERM )
158
- else :
159
- process_under_alerting [process_name ]["last_alerted" ] = time .time ()
160
- process_under_alerting [process_name ]["dead_minutes" ] = 0
161
-
162
- # Handle the PROCESS_STATE_RUNNING event
163
- elif headers ['eventname' ] == 'PROCESS_STATE_RUNNING' :
164
- payload_headers , payload_data = childutils .eventdata (payload + '\n ' )
165
- process_name = payload_headers ['processname' ]
166
-
167
- if process_name in process_under_alerting :
168
- process_under_alerting .pop (process_name )
169
-
147
+ try :
148
+ file_descriptor_list = select .select ([sys .stdin ], [], [], SELECT_TIMEOUT_SECS )[0 ]
149
+ if len (file_descriptor_list ) > 0 :
150
+ line = file_descriptor_list [0 ].readline ()
151
+ headers = childutils .get_headers (line )
152
+ payload = sys .stdin .read (int (headers ['len' ]))
153
+
154
+ # Handle the PROCESS_STATE_EXITED event
155
+ if headers ['eventname' ] == 'PROCESS_STATE_EXITED' :
156
+ payload_headers , payload_data = childutils .eventdata (payload + '\n ' )
157
+
158
+ expected = int (payload_headers ['expected' ])
159
+ process_name = payload_headers ['processname' ]
160
+ group_name = payload_headers ['groupname' ]
161
+
162
+ if (process_name in critical_process_list or group_name in critical_group_list ) and expected == 0 :
163
+ is_auto_restart = get_autorestart_state (container_name )
164
+ if is_auto_restart != "disabled" :
165
+ MSG_FORMAT_STR = "Process '{}' exited unexpectedly. Terminating supervisor '{}'"
166
+ msg = MSG_FORMAT_STR .format (payload_headers ['processname' ], container_name )
167
+ syslog .syslog (syslog .LOG_INFO , msg )
168
+ publish_events (events_handle , payload_headers ['processname' ], container_name )
169
+ swsscommon .events_deinit_publisher (events_handle )
170
+ os .kill (os .getppid (), signal .SIGTERM )
171
+ else :
172
+ process_under_alerting [process_name ]["last_alerted" ] = time .time ()
173
+ process_under_alerting [process_name ]["dead_minutes" ] = 0
174
+
175
+ # Handle the PROCESS_STATE_RUNNING event
176
+ elif headers ['eventname' ] == 'PROCESS_STATE_RUNNING' :
177
+ payload_headers , payload_data = childutils .eventdata (payload + '\n ' )
178
+ process_name = payload_headers ['processname' ]
179
+
180
+ if process_name in process_under_alerting :
181
+ process_under_alerting .pop (process_name )
182
+
183
+ # Handle the PROCESS_COMMUNICATION_STDOUT event
184
+ elif headers ['eventname' ] == 'PROCESS_COMMUNICATION_STDOUT' :
185
+ payload_headers , payload_data = childutils .eventdata (payload + '\n ' )
186
+ process_name = payload_headers ['processname' ]
187
+
188
+ # update process heart beat time
189
+ if (process_name in watch_process_list ):
190
+ process_heart_beat_info [process_name ]["last_heart_beat" ] = time .time ()
191
+
192
+
193
+ # Check whether we need to write "not running" alerting messages into syslog
194
+ for process_name in process_under_alerting .keys ():
195
+ epoch_time = time .time ()
196
+ elapsed_secs = epoch_time - process_under_alerting [process_name ]["last_alerted" ]
197
+ if elapsed_secs >= ALERTING_INTERVAL_SECS :
198
+ elapsed_mins = elapsed_secs // 60
199
+ process_under_alerting [process_name ]["last_alerted" ] = epoch_time
200
+ process_under_alerting [process_name ]["dead_minutes" ] += elapsed_mins
201
+ generate_alerting_message (process_name , "not running" , process_under_alerting [process_name ]["dead_minutes" ])
202
+
203
+ # Check whether any watched process has missed its heartbeat and needs a "stuck" alert in syslog
204
+ for process in process_heart_beat_info .keys ():
205
+ epoch_time = time .time ()
206
+ elapsed_secs = epoch_time - process_heart_beat_info [process ]["last_heart_beat" ]
207
+ if elapsed_secs >= ALERTING_INTERVAL_SECS :
208
+ elapsed_mins = elapsed_secs // 60
209
+ generate_alerting_message (process , "stuck" , elapsed_mins )
210
+
211
+ except Exception as ex :
212
+ syslog .syslog (syslog .LOG_ERR , "Exception: {}" .format (ex ))
213
+ finally :
170
214
# Transition from BUSY to ACKNOWLEDGED
171
215
childutils .listener .ok ()
172
216
173
217
# Transition from ACKNOWLEDGED to READY
174
218
childutils .listener .ready ()
175
219
176
- # Check whether we need write alerting messages into syslog
177
- for process_name in process_under_alerting .keys ():
178
- epoch_time = time .time ()
179
- elapsed_secs = epoch_time - process_under_alerting [process_name ]["last_alerted" ]
180
- if elapsed_secs >= ALERTING_INTERVAL_SECS :
181
- elapsed_mins = elapsed_secs // 60
182
- process_under_alerting [process_name ]["last_alerted" ] = epoch_time
183
- process_under_alerting [process_name ]["dead_minutes" ] += elapsed_mins
184
- generate_alerting_message (process_name , process_under_alerting [process_name ]["dead_minutes" ])
185
-
186
220
if __name__ == "__main__" :
187
221
main (sys .argv [1 :])
0 commit comments