@@ -14,6 +14,12 @@ from swsscommon import swsscommon
14
14
15
15
from supervisor import childutils
16
16
17
+ # Each line of this file should specify one process (as defined in the supervisord.conf file), in the
18
+ # following format:
19
+ #
20
+ # program:<process_name>
21
+ WATCH_PROCESSES_FILE = '/etc/supervisor/watchdog_processes'
22
+
17
23
# Each line of this file should specify either one critical process or one
18
24
# critical process group, (as defined in supervisord.conf file), in the
19
25
# following format:
@@ -34,40 +40,40 @@ ALERTING_INTERVAL_SECS = 60
34
40
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
35
41
EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly"
36
42
37
def get_group_and_process_list(process_file):
    """
    @summary: Read the group and process names listed in a process list file.
              Every non-blank line must have the form 'group:<group_name>' or
              'program:<process_name>'; whitespace around the key and the value
              is ignored.
    @param process_file: Path of the file to parse.
    @return: Two lists which contain group names and process names respectively.
    @note: On a malformed line an error is written to syslog and the script exits
           with code 5 (the line does not contain exactly one ':') or code 6
           (unknown key or empty value).
    """
    group_list = []
    process_list = []

    with open(process_file, 'r') as process_fd:
        for line in process_fd:
            # ignore blank lines
            if not line.strip():
                continue

            line_info = line.strip(' \n').split(':')
            if len(line_info) != 2:
                # Name the offending file: this parser is shared by several
                # process list files, so the line alone is not enough to locate it.
                syslog.syslog(syslog.LOG_ERR,
                              "Syntax of the line {} in processes file {} is incorrect. Exiting..."
                              .format(line.strip(), process_file))
                sys.exit(5)

            identifier_key = line_info[0].strip()
            identifier_value = line_info[1].strip()
            if identifier_key == "group" and identifier_value:
                group_list.append(identifier_value)
            elif identifier_key == "program" and identifier_value:
                process_list.append(identifier_value)
            else:
                syslog.syslog(syslog.LOG_ERR,
                              "Syntax of the line {} in processes file {} is incorrect. Exiting..."
                              .format(line.strip(), process_file))
                sys.exit(6)

    return group_list, process_list
68
74
69
75
70
- def generate_alerting_message (process_name , dead_minutes ):
76
+ def generate_alerting_message (process_name , status , dead_minutes ):
71
77
"""
72
78
@summary: If a critical process was not running, this function will determine it resides in host
73
79
or in a specific namespace. Then an alerting message will be written into syslog.
@@ -80,8 +86,8 @@ def generate_alerting_message(process_name, dead_minutes):
80
86
else :
81
87
namespace = namespace_prefix + namespace_id
82
88
83
- syslog .syslog (syslog .LOG_ERR , "Process '{}' is not running in namespace '{}' ({} minutes)."
84
- .format (process_name , namespace , dead_minutes ))
89
+ syslog .syslog (syslog .LOG_ERR , "Process '{}' is {} in namespace '{}' ({} minutes)."
90
+ .format (process_name , status , namespace , dead_minutes ))
85
91
86
92
87
93
def get_autorestart_state (container_name ):
@@ -125,9 +131,15 @@ def main(argv):
125
131
syslog .syslog (syslog .LOG_ERR , "Container name not specified. Exiting..." )
126
132
sys .exit (1 )
127
133
128
- critical_group_list , critical_process_list = get_critical_group_and_process_list ()
134
+ critical_group_list , critical_process_list = get_group_and_process_list (CRITICAL_PROCESSES_FILE )
135
+
136
+ # WATCH_PROCESSES_FILE is optional
137
+ watch_process_list = []
138
+ if os .path .exists (WATCH_PROCESSES_FILE ):
139
+ _ , watch_process_list = get_group_and_process_list (WATCH_PROCESSES_FILE )
129
140
130
141
process_under_alerting = defaultdict (dict )
142
+ process_heart_beat_info = defaultdict (dict )
131
143
# Transition from ACKNOWLEDGED to READY
132
144
childutils .listener .ready ()
133
145
events_handle = swsscommon .events_init_publisher (EVENTS_PUBLISHER_SOURCE )
@@ -167,6 +179,15 @@ def main(argv):
167
179
if process_name in process_under_alerting :
168
180
process_under_alerting .pop (process_name )
169
181
182
+ # Handle the PROCESS_COMMUNICATION_STDOUT event
183
+ elif headers ['eventname' ] == 'PROCESS_COMMUNICATION_STDOUT' :
184
+ payload_headers , payload_data = childutils .eventdata (payload + '\n ' )
185
+ process_name = payload_headers ['processname' ]
186
+
187
+ # update process heart beat time
188
+ if (process_name in watch_process_list ):
189
+ process_heart_beat_info [process_name ]["last_heart_beat" ] = time .time ()
190
+
170
191
# Transition from BUSY to ACKNOWLEDGED
171
192
childutils .listener .ok ()
172
193
@@ -181,7 +202,15 @@ def main(argv):
181
202
elapsed_mins = elapsed_secs // 60
182
203
process_under_alerting [process_name ]["last_alerted" ] = epoch_time
183
204
process_under_alerting [process_name ]["dead_minutes" ] += elapsed_mins
184
- generate_alerting_message (process_name , process_under_alerting [process_name ]["dead_minutes" ])
205
+ generate_alerting_message (process_name , "not running" , process_under_alerting [process_name ]["dead_minutes" ])
206
+
207
+ # Check whether we need to write "stuck" alerting messages into syslog for watched processes
208
+ for process in process_heart_beat_info .keys ():
209
+ epoch_time = time .time ()
210
+ elapsed_secs = epoch_time - process_heart_beat_info [process ]["last_heart_beat" ]
211
+ if elapsed_secs >= ALERTING_INTERVAL_SECS :
212
+ elapsed_mins = elapsed_secs // 60
213
+ generate_alerting_message (process , "stuck" , elapsed_mins )
185
214
186
215
if __name__ == "__main__" :
187
- main (sys .argv [1 :])
216
+ main (sys .argv [1 :])
0 commit comments