Skip to content

Commit 46cb307

Browse files
committed
Merge watchdog code to proc exit listener
1 parent ee1ab44 commit 46cb307

File tree

4 files changed

+96
-59
lines changed

4 files changed

+96
-59
lines changed

dockers/docker-orchagent/docker-init.j2

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ CFGGEN_PARAMS=" \
1818
-t /usr/share/sonic/templates/vlan_vars.j2 \
1919
-t /usr/share/sonic/templates/ndppd.conf.j2,/etc/ndppd.conf \
2020
-t /usr/share/sonic/templates/critical_processes.j2,/etc/supervisor/critical_processes \
21+
-t /usr/share/sonic/templates/watchdog_processes.j2,/etc/supervisor/watchdog_processes \
2122
-t /usr/share/sonic/templates/supervisord.conf.j2,/etc/supervisor/conf.d/supervisord.conf
2223
-t /usr/share/sonic/templates/wait_for_link.sh.j2,/usr/bin/wait_for_link.sh \
2324
"

dockers/docker-orchagent/supervisord.conf.j2

+1
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ command=/usr/bin/orchagent.sh
7575
priority=4
7676
autostart=false
7777
autorestart=false
78+
stdout_capture_maxbytes=1MB
7879
stdout_logfile=syslog
7980
stderr_logfile=syslog
8081
dependent_startup=true
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
program:orchagent

files/scripts/supervisor-proc-exit-listener

+93-59
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,16 @@ from collections import defaultdict
1313
from swsscommon import swsscommon
1414

1515
from supervisor import childutils
16+
from supervisor.events import EventTypes, getEventNameByType
17+
18+
PROCESS_COMMUNICATION_STDOUT = \
19+
getEventNameByType(EventTypes.PROCESS_COMMUNICATION_STDOUT)
20+
21+
# Each line of this file should specify one process, (as defined in supervisord.conf file), in the
22+
# following format:
23+
#
24+
# program:<process_name>
25+
WATCH_PROCESSES_FILE = '/etc/supervisor/watchdog_processes'
1626

1727
# Each line of this file should specify either one critical process or one
1828
# critical process group, (as defined in supervisord.conf file), in the
@@ -34,40 +44,40 @@ ALERTING_INTERVAL_SECS = 60
3444
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
3545
EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly"
3646

37-
def get_group_and_process_list(process_file):
    """
    @summary: Read the group and process names listed in a supervisor processes file.
    @param process_file: Path of the file to parse. Every non-blank line must have the form
                         'program:<process_name>' or 'group:<group_name>'.
    @return: Two lists which contain group names and process names respectively.
             The script exits with status 5 when a line does not contain exactly one ':'
             and with status 6 when the key is unknown or the name is empty.
    """
    group_list = []
    process_list = []

    # The handle is deliberately not called 'file' so that no builtin name is shadowed.
    with open(process_file, 'r') as processes_handle:
        for line in processes_handle:
            # ignore blank lines
            if re.match(r"^\s*$", line):
                continue

            line_info = line.strip(' \n').split(':')
            if len(line_info) != 2:
                syslog.syslog(syslog.LOG_ERR,
                              "Syntax of the line {} in processes file is incorrect. Exiting...".format(line))
                sys.exit(5)

            identifier_key = line_info[0].strip()
            identifier_value = line_info[1].strip()
            if identifier_key == "group" and identifier_value:
                group_list.append(identifier_value)
            elif identifier_key == "program" and identifier_value:
                process_list.append(identifier_value)
            else:
                syslog.syslog(syslog.LOG_ERR,
                              "Syntax of the line {} in processes file is incorrect. Exiting...".format(line))
                sys.exit(6)

    return group_list, process_list
6878

6979

70-
def generate_alerting_message(process_name, dead_minutes):
80+
def generate_alerting_message(process_name, status, dead_minutes):
7181
"""
7282
@summary: If a critical process was not running, this function will determine it resides in host
7383
or in a specific namespace. Then an alerting message will be written into syslog.
@@ -80,8 +90,8 @@ def generate_alerting_message(process_name, dead_minutes):
8090
else:
8191
namespace = namespace_prefix + namespace_id
8292

83-
syslog.syslog(syslog.LOG_ERR, "Process '{}' is not running in namespace '{}' ({} minutes)."
84-
.format(process_name, namespace, dead_minutes))
93+
syslog.syslog(syslog.LOG_ERR, "Process '{}' is {} in namespace '{}' ({} minutes)."
94+
.format(process_name, status, namespace, dead_minutes))
8595

8696

8797
def get_autorestart_state(container_name):
@@ -125,63 +135,87 @@ def main(argv):
125135
syslog.syslog(syslog.LOG_ERR, "Container name not specified. Exiting...")
126136
sys.exit(1)
127137

128-
critical_group_list, critical_process_list = get_critical_group_and_process_list()
138+
critical_group_list, critical_process_list = get_group_and_process_list(CRITICAL_PROCESSES_FILE)
139+
_, watch_process_list = get_group_and_process_list(WATCH_PROCESSES_FILE)
129140

130141
process_under_alerting = defaultdict(dict)
142+
process_heart_beat_info = defaultdict(dict)
131143
# Transition from ACKNOWLEDGED to READY
132144
childutils.listener.ready()
133145
events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
134146
while True:
135-
file_descriptor_list = select.select([sys.stdin], [], [], SELECT_TIMEOUT_SECS)[0]
136-
if len(file_descriptor_list) > 0:
137-
line = file_descriptor_list[0].readline()
138-
headers = childutils.get_headers(line)
139-
payload = sys.stdin.read(int(headers['len']))
140-
141-
# Handle the PROCESS_STATE_EXITED event
142-
if headers['eventname'] == 'PROCESS_STATE_EXITED':
143-
payload_headers, payload_data = childutils.eventdata(payload + '\n')
144-
145-
expected = int(payload_headers['expected'])
146-
process_name = payload_headers['processname']
147-
group_name = payload_headers['groupname']
148-
149-
if (process_name in critical_process_list or group_name in critical_group_list) and expected == 0:
150-
is_auto_restart = get_autorestart_state(container_name)
151-
if is_auto_restart != "disabled":
152-
MSG_FORMAT_STR = "Process '{}' exited unexpectedly. Terminating supervisor '{}'"
153-
msg = MSG_FORMAT_STR.format(payload_headers['processname'], container_name)
154-
syslog.syslog(syslog.LOG_INFO, msg)
155-
publish_events(events_handle, payload_headers['processname'], container_name)
156-
swsscommon.events_deinit_publisher(events_handle)
157-
os.kill(os.getppid(), signal.SIGTERM)
158-
else:
159-
process_under_alerting[process_name]["last_alerted"] = time.time()
160-
process_under_alerting[process_name]["dead_minutes"] = 0
161-
162-
# Handle the PROCESS_STATE_RUNNING event
163-
elif headers['eventname'] == 'PROCESS_STATE_RUNNING':
164-
payload_headers, payload_data = childutils.eventdata(payload + '\n')
165-
process_name = payload_headers['processname']
166-
167-
if process_name in process_under_alerting:
168-
process_under_alerting.pop(process_name)
169-
147+
try:
148+
file_descriptor_list = select.select([sys.stdin], [], [], SELECT_TIMEOUT_SECS)[0]
149+
if len(file_descriptor_list) > 0:
150+
line = file_descriptor_list[0].readline()
151+
headers = childutils.get_headers(line)
152+
payload = sys.stdin.read(int(headers['len']))
153+
154+
# Handle the PROCESS_STATE_EXITED event
155+
if headers['eventname'] == 'PROCESS_STATE_EXITED':
156+
payload_headers, payload_data = childutils.eventdata(payload + '\n')
157+
158+
expected = int(payload_headers['expected'])
159+
process_name = payload_headers['processname']
160+
group_name = payload_headers['groupname']
161+
162+
if (process_name in critical_process_list or group_name in critical_group_list) and expected == 0:
163+
is_auto_restart = get_autorestart_state(container_name)
164+
if is_auto_restart != "disabled":
165+
MSG_FORMAT_STR = "Process '{}' exited unexpectedly. Terminating supervisor '{}'"
166+
msg = MSG_FORMAT_STR.format(payload_headers['processname'], container_name)
167+
syslog.syslog(syslog.LOG_INFO, msg)
168+
publish_events(events_handle, payload_headers['processname'], container_name)
169+
swsscommon.events_deinit_publisher(events_handle)
170+
os.kill(os.getppid(), signal.SIGTERM)
171+
else:
172+
process_under_alerting[process_name]["last_alerted"] = time.time()
173+
process_under_alerting[process_name]["dead_minutes"] = 0
174+
175+
# Handle the PROCESS_STATE_RUNNING event
176+
elif headers['eventname'] == 'PROCESS_STATE_RUNNING':
177+
payload_headers, payload_data = childutils.eventdata(payload + '\n')
178+
process_name = payload_headers['processname']
179+
180+
if process_name in process_under_alerting:
181+
process_under_alerting.pop(process_name)
182+
183+
# Handle the PROCESS_COMMUNICATION_STDOUT event
184+
elif headers['eventname'] == 'PROCESS_COMMUNICATION_STDOUT':
185+
payload_headers, payload_data = childutils.eventdata(payload + '\n')
186+
process_name = payload_headers['processname']
187+
188+
# update process heart beat time
189+
if (process_name in watch_process_list):
190+
process_heart_beat_info[process_name]["last_heart_beat"] = time.time()
191+
192+
193+
# Check whether we need to write alerting messages into syslog for exited critical processes
194+
for process_name in process_under_alerting.keys():
195+
epoch_time = time.time()
196+
elapsed_secs = epoch_time - process_under_alerting[process_name]["last_alerted"]
197+
if elapsed_secs >= ALERTING_INTERVAL_SECS:
198+
elapsed_mins = elapsed_secs // 60
199+
process_under_alerting[process_name]["last_alerted"] = epoch_time
200+
process_under_alerting[process_name]["dead_minutes"] += elapsed_mins
201+
generate_alerting_message(process_name, "not running", process_under_alerting[process_name]["dead_minutes"])
202+
203+
# Check whether any watched process has gone without a heartbeat for at least the alerting interval and report it as stuck in syslog
204+
for process in process_heart_beat_info.keys():
205+
epoch_time = time.time()
206+
elapsed_secs = epoch_time - process_heart_beat_info[process]["last_heart_beat"]
207+
if elapsed_secs >= ALERTING_INTERVAL_SECS:
208+
elapsed_mins = elapsed_secs // 60
209+
generate_alerting_message(process, "stuck", elapsed_mins)
210+
211+
except Exception as ex:
212+
syslog.syslog(syslog.LOG_ERR, "Exception: {}".format(ex))
213+
finally:
170214
# Transition from BUSY to ACKNOWLEDGED
171215
childutils.listener.ok()
172216

173217
# Transition from ACKNOWLEDGED to READY
174218
childutils.listener.ready()
175219

176-
# Check whether we need write alerting messages into syslog
177-
for process_name in process_under_alerting.keys():
178-
epoch_time = time.time()
179-
elapsed_secs = epoch_time - process_under_alerting[process_name]["last_alerted"]
180-
if elapsed_secs >= ALERTING_INTERVAL_SECS:
181-
elapsed_mins = elapsed_secs // 60
182-
process_under_alerting[process_name]["last_alerted"] = epoch_time
183-
process_under_alerting[process_name]["dead_minutes"] += elapsed_mins
184-
generate_alerting_message(process_name, process_under_alerting[process_name]["dead_minutes"])
185-
186220
if __name__ == "__main__":
187221
main(sys.argv[1:])

0 commit comments

Comments
 (0)