@@ -14,6 +14,12 @@ from swsscommon import swsscommon
14
14
15
15
from supervisor import childutils
16
16
17
+ # Each line of this file should specify one process (as defined in the supervisord.conf file), in the
18
+ # following format:
19
+ #
20
+ # program:<process_name>
21
+ WATCH_PROCESSES_FILE = '/etc/supervisor/watchdog_processes'
22
+
17
23
# Each line of this file should specify either one critical process or one
18
24
# critical process group, (as defined in supervisord.conf file), in the
19
25
# following format:
@@ -34,40 +40,40 @@ ALERTING_INTERVAL_SECS = 60
34
40
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
35
41
EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly"
36
42
37
def get_group_and_process_list(process_file):
    """
    @summary: Read the group and process names listed in the given file.
              Every non-blank line must have the form 'group:<group_name>' or
              'program:<process_name>'; blank lines are ignored.
    @param process_file: Path of the file to parse.
    @return: Two lists which contain group names and process names respectively.
    @note: On a malformed line an error is written to syslog and the script
           exits with code 5 (the line does not contain exactly one ':') or
           code 6 (unknown key, or empty name after the ':').
    """
    group_list = []
    process_list = []

    with open(process_file, 'r') as process_fd:
        for line in process_fd:
            # Strip once up front so the trailing newline never ends up
            # embedded in a syslog record.
            entry = line.strip()

            # ignore blank lines
            if not entry:
                continue

            line_info = entry.split(':')
            if len(line_info) != 2:
                syslog.syslog(syslog.LOG_ERR,
                              "Syntax of the line {} in processes file is incorrect. Exiting...".format(entry))
                sys.exit(5)

            identifier_key = line_info[0].strip()
            identifier_value = line_info[1].strip()
            if identifier_key == "group" and identifier_value:
                group_list.append(identifier_value)
            elif identifier_key == "program" and identifier_value:
                process_list.append(identifier_value)
            else:
                syslog.syslog(syslog.LOG_ERR,
                              "Syntax of the line {} in processes file is incorrect. Exiting...".format(entry))
                sys.exit(6)

    return group_list, process_list
68
74
69
75
70
- def generate_alerting_message (process_name , dead_minutes ):
76
+ def generate_alerting_message (process_name , status , dead_minutes ):
71
77
"""
72
78
@summary: If a critical process was not running, this function will determine it resides in host
73
79
or in a specific namespace. Then an alerting message will be written into syslog.
@@ -80,8 +86,8 @@ def generate_alerting_message(process_name, dead_minutes):
80
86
else :
81
87
namespace = namespace_prefix + namespace_id
82
88
83
- syslog .syslog (syslog .LOG_ERR , "Process '{}' is not running in namespace '{}' ({} minutes)."
84
- .format (process_name , namespace , dead_minutes ))
89
+ syslog .syslog (syslog .LOG_ERR , "Process '{}' is {} in namespace '{}' ({} minutes)."
90
+ .format (process_name , status , namespace , dead_minutes ))
85
91
86
92
87
93
def get_autorestart_state (container_name ):
@@ -125,9 +131,11 @@ def main(argv):
125
131
syslog .syslog (syslog .LOG_ERR , "Container name not specified. Exiting..." )
126
132
sys .exit (1 )
127
133
128
- critical_group_list , critical_process_list = get_critical_group_and_process_list ()
134
+ critical_group_list , critical_process_list = get_group_and_process_list (CRITICAL_PROCESSES_FILE )
135
+ _ , watch_process_list = get_group_and_process_list (WATCH_PROCESSES_FILE )
129
136
130
137
process_under_alerting = defaultdict (dict )
138
+ process_heart_beat_info = defaultdict (dict )
131
139
# Transition from ACKNOWLEDGED to READY
132
140
childutils .listener .ready ()
133
141
events_handle = swsscommon .events_init_publisher (EVENTS_PUBLISHER_SOURCE )
@@ -167,6 +175,15 @@ def main(argv):
167
175
if process_name in process_under_alerting :
168
176
process_under_alerting .pop (process_name )
169
177
178
+ # Handle the PROCESS_COMMUNICATION_STDOUT event
179
+ elif headers ['eventname' ] == 'PROCESS_COMMUNICATION_STDOUT' :
180
+ payload_headers , payload_data = childutils .eventdata (payload + '\n ' )
181
+ process_name = payload_headers ['processname' ]
182
+
183
+ # update process heart beat time
184
+ if (process_name in watch_process_list ):
185
+ process_heart_beat_info [process_name ]["last_heart_beat" ] = time .time ()
186
+
170
187
# Transition from BUSY to ACKNOWLEDGED
171
188
childutils .listener .ok ()
172
189
@@ -181,7 +198,15 @@ def main(argv):
181
198
elapsed_mins = elapsed_secs // 60
182
199
process_under_alerting [process_name ]["last_alerted" ] = epoch_time
183
200
process_under_alerting [process_name ]["dead_minutes" ] += elapsed_mins
184
- generate_alerting_message (process_name , process_under_alerting [process_name ]["dead_minutes" ])
201
+ generate_alerting_message (process_name , "not running" , process_under_alerting [process_name ]["dead_minutes" ])
202
+
203
+ # Check whether we need to write alerting messages into syslog for watched processes whose last heartbeat is overdue
204
+ for process in process_heart_beat_info .keys ():
205
+ epoch_time = time .time ()
206
+ elapsed_secs = epoch_time - process_heart_beat_info [process ]["last_heart_beat" ]
207
+ if elapsed_secs >= ALERTING_INTERVAL_SECS :
208
+ elapsed_mins = elapsed_secs // 60
209
+ generate_alerting_message (process , "stuck" , elapsed_mins )
185
210
186
211
if __name__ == "__main__" :
187
212
main (sys .argv [1 :])
0 commit comments