Skip to content

Commit 44427a2

Browse files
authored
Add a watchdog mechanism to the swss service and generate an alert when swss has an issue. (sonic-net#14686)
This PR depends on sonic-net/sonic-swss#2737 being merged first. **What I did** Added an orchagent watchdog to monitor orchagent and raise an alert when it is stuck. **Why I did it** Currently the SONiC monit system only checks whether the orchagent process exists. If the orchagent process is stuck and stops processing, monit cannot detect or report it. **How I verified it** Passed all UTs. Added a new UT, sonic-net/sonic-mgmt#8306, to check that the watchdog works correctly. Manually tested: after pausing orchagent with 'kill -STOP <pid>', verified that an alerting message appears in the log: Apr 28 23:36:41.504923 vlab-01 ERR swss#supervisor-proc-watchdog-listener: Process 'orchagent' is stuck in namespace 'host' (1.0 minutes). **Details if related** Heartbeat message PR: sonic-net/sonic-swss#2737 UT PR: sonic-net/sonic-mgmt#8306
1 parent 381cfe4 commit 44427a2

File tree

4 files changed

+44
-16
lines changed

4 files changed

+44
-16
lines changed

dockers/docker-orchagent/docker-init.j2

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ CFGGEN_PARAMS=" \
1818
-t /usr/share/sonic/templates/vlan_vars.j2 \
1919
-t /usr/share/sonic/templates/ndppd.conf.j2,/etc/ndppd.conf \
2020
-t /usr/share/sonic/templates/critical_processes.j2,/etc/supervisor/critical_processes \
21+
-t /usr/share/sonic/templates/watchdog_processes.j2,/etc/supervisor/watchdog_processes \
2122
-t /usr/share/sonic/templates/supervisord.conf.j2,/etc/supervisor/conf.d/supervisord.conf
2223
-t /usr/share/sonic/templates/wait_for_link.sh.j2,/usr/bin/wait_for_link.sh \
2324
"

dockers/docker-orchagent/supervisord.conf.j2

+2-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ buffer_size=1024
1414

1515
[eventlistener:supervisor-proc-exit-listener]
1616
command=/usr/bin/supervisor-proc-exit-listener --container-name swss
17-
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
17+
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING,PROCESS_COMMUNICATION_STDOUT
1818
autostart=true
1919
autorestart=unexpected
2020
buffer_size=1024
@@ -75,6 +75,7 @@ command=/usr/bin/orchagent.sh
7575
priority=4
7676
autostart=false
7777
autorestart=false
78+
stdout_capture_maxbytes=1MB
7879
stdout_logfile=syslog
7980
stderr_logfile=syslog
8081
dependent_startup=true
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
program:orchagent

files/scripts/supervisor-proc-exit-listener

+40-15
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,12 @@ from swsscommon import swsscommon
1414

1515
from supervisor import childutils
1616

17+
# Each line of this file should specify one process, (as defined in supervisord.conf file), in the
18+
# following format:
19+
#
20+
# program:<process_name>
21+
WATCH_PROCESSES_FILE = '/etc/supervisor/watchdog_processes'
22+
1723
# Each line of this file should specify either one critical process or one
1824
# critical process group, (as defined in supervisord.conf file), in the
1925
# following format:
@@ -34,40 +40,40 @@ ALERTING_INTERVAL_SECS = 60
3440
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
3541
EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly"
3642

37-
def get_critical_group_and_process_list():
43+
def get_group_and_process_list(process_file):
3844
"""
39-
@summary: Read the critical processes/group names from CRITICAL_PROCESSES_FILE.
45+
@summary: Read the critical processes/group names.
4046
@return: Two lists which contain critical processes and group names respectively.
4147
"""
42-
critical_group_list = []
43-
critical_process_list = []
48+
group_list = []
49+
process_list = []
4450

45-
with open(CRITICAL_PROCESSES_FILE, 'r') as file:
51+
with open(process_file, 'r') as file:
4652
for line in file:
4753
# ignore blank lines
4854
if re.match(r"^\s*$", line):
4955
continue
5056
line_info = line.strip(' \n').split(':')
5157
if len(line_info) != 2:
5258
syslog.syslog(syslog.LOG_ERR,
53-
"Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line))
59+
"Syntax of the line {} in processes file is incorrect. Exiting...".format(line))
5460
sys.exit(5)
5561

5662
identifier_key = line_info[0].strip()
5763
identifier_value = line_info[1].strip()
5864
if identifier_key == "group" and identifier_value:
59-
critical_group_list.append(identifier_value)
65+
group_list.append(identifier_value)
6066
elif identifier_key == "program" and identifier_value:
61-
critical_process_list.append(identifier_value)
67+
process_list.append(identifier_value)
6268
else:
6369
syslog.syslog(syslog.LOG_ERR,
64-
"Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line))
70+
"Syntax of the line {} in processes file is incorrect. Exiting...".format(line))
6571
sys.exit(6)
6672

67-
return critical_group_list, critical_process_list
73+
return group_list, process_list
6874

6975

70-
def generate_alerting_message(process_name, dead_minutes):
76+
def generate_alerting_message(process_name, status, dead_minutes):
7177
"""
7278
@summary: If a critical process was not running, this function will determine it resides in host
7379
or in a specific namespace. Then an alerting message will be written into syslog.
@@ -80,8 +86,8 @@ def generate_alerting_message(process_name, dead_minutes):
8086
else:
8187
namespace = namespace_prefix + namespace_id
8288

83-
syslog.syslog(syslog.LOG_ERR, "Process '{}' is not running in namespace '{}' ({} minutes)."
84-
.format(process_name, namespace, dead_minutes))
89+
syslog.syslog(syslog.LOG_ERR, "Process '{}' is {} in namespace '{}' ({} minutes)."
90+
.format(process_name, status, namespace, dead_minutes))
8591

8692

8793
def get_autorestart_state(container_name):
@@ -125,9 +131,11 @@ def main(argv):
125131
syslog.syslog(syslog.LOG_ERR, "Container name not specified. Exiting...")
126132
sys.exit(1)
127133

128-
critical_group_list, critical_process_list = get_critical_group_and_process_list()
134+
critical_group_list, critical_process_list = get_group_and_process_list(CRITICAL_PROCESSES_FILE)
135+
_, watch_process_list = get_group_and_process_list(WATCH_PROCESSES_FILE)
129136

130137
process_under_alerting = defaultdict(dict)
138+
process_heart_beat_info = defaultdict(dict)
131139
# Transition from ACKNOWLEDGED to READY
132140
childutils.listener.ready()
133141
events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
@@ -167,6 +175,15 @@ def main(argv):
167175
if process_name in process_under_alerting:
168176
process_under_alerting.pop(process_name)
169177

178+
# Handle the PROCESS_COMMUNICATION_STDOUT event
179+
elif headers['eventname'] == 'PROCESS_COMMUNICATION_STDOUT':
180+
payload_headers, payload_data = childutils.eventdata(payload + '\n')
181+
process_name = payload_headers['processname']
182+
183+
# update process heart beat time
184+
if (process_name in watch_process_list):
185+
process_heart_beat_info[process_name]["last_heart_beat"] = time.time()
186+
170187
# Transition from BUSY to ACKNOWLEDGED
171188
childutils.listener.ok()
172189

@@ -181,7 +198,15 @@ def main(argv):
181198
elapsed_mins = elapsed_secs // 60
182199
process_under_alerting[process_name]["last_alerted"] = epoch_time
183200
process_under_alerting[process_name]["dead_minutes"] += elapsed_mins
184-
generate_alerting_message(process_name, process_under_alerting[process_name]["dead_minutes"])
201+
generate_alerting_message(process_name, "not running", process_under_alerting[process_name]["dead_minutes"])
202+
203+
# Check whether we need write alerting messages into syslog
204+
for process in process_heart_beat_info.keys():
205+
epoch_time = time.time()
206+
elapsed_secs = epoch_time - process_heart_beat_info[process]["last_heart_beat"]
207+
if elapsed_secs >= ALERTING_INTERVAL_SECS:
208+
elapsed_mins = elapsed_secs // 60
209+
generate_alerting_message(process, "stuck", elapsed_mins)
185210

186211
if __name__ == "__main__":
187212
main(sys.argv[1:])

0 commit comments

Comments
 (0)